Last data update: 2014.03.03

R: Lahman Datasets
LahmanData — R Documentation

Lahman Datasets

Description

This dataset gives a concise description of the data files in the Lahman package. It may be useful for computing on the various files.

Usage

data(LahmanData)

Format

A data frame with 24 observations on the following 5 variables.

file

name of dataset

class

class of dataset

nobs

number of observations

nvar

number of variables

title

dataset title

Details

This dataset is generated using vcdExtra::datasets(package="Lahman") with some post-processing.

Examples

data(LahmanData)

# find ID variables in the datasets: for every dataset named in the
# "file" column, keep the column names that end in "ID"
files <- LahmanData[, "file"]
IDvars <- lapply(files, function(x) grep('.*ID$', colnames(get(x)), value = TRUE))
names(IDvars) <- files
str(IDvars)
# vector of unique ID variables
unique(unlist(IDvars))

# which datasets have playerID?
# vapply() guarantees exactly one logical per dataset, whatever the input
names(which(vapply(IDvars, function(x) "playerID" %in% x, logical(1))))

################################################
# Visualize relations among datasets via an MDS
################################################
# Jaccard distance between two sets; assure positivity.
#
# A, B: vectors treated as sets (union/intersect ignore duplicates).
# Returns 1 - |intersect(A, B)| / |union(A, B)|, floored at .00001 so that
# the result is always strictly positive, even for identical sets.
jaccard <- function(A, B) {
	n_union <- length(union(A, B))
	# Two empty sets would give 0/0 = NaN, and max(NaN, ...) stays NaN.
	# Treat them as identical sets, i.e. return the positive floor.
	if (n_union == 0) return(.00001)
	max(1 - length(intersect(A, B)) / n_union, .00001)
}

# Build a square matrix of pairwise distances between the elements of a list.
#
# vars: a list; its names (if any) become the row and column names.
# FUN:  function of two arguments, called as FUN(vars[[i]], vars[[j]]) for
#       every ordered pair with i != j (so FUN need not be symmetric).
# Returns a length(vars) x length(vars) numeric matrix with a zero diagonal.
distmat <- function(vars, FUN=jaccard) {
	nv <- length(vars)
	d <- matrix(0, nv, nv, dimnames=list(names(vars), names(vars)))
	# seq_len() rather than 1:nv, so an empty list yields an empty matrix
	# instead of iterating over c(1, 0) and indexing out of bounds
	idx <- seq_len(nv)
	for (i in idx) {
		for (j in idx) {
			if (i != j) d[i,j] <- FUN(vars[[i]], vars[[j]])
		}
	}
	d
}

# classical MDS on the matrix of pairwise ID-variable distances
distID <- distmat(IDvars)
config <- cmdscale(distID)

# first two MDS coordinates of each dataset
mds_x <- config[, 1]
mds_y <- config[, 2]

# cycle label placement (below, left, above, right) to reduce overplotting
pos <- rep(1:4, length.out = nrow(config))
plot(mds_x, mds_y, xlab = "", ylab = "", asp = 1, axes = FALSE,
	main = "MDS of ID variable distances of Lahman tables")
abline(h = 0, v = 0, col = "gray80")
# xpd = NA lets labels extend past the plot region instead of being clipped
text(mds_x, mds_y, rownames(config), cex = 0.75, pos = pos, xpd = NA)

Results


R version 3.3.1 (2016-06-21) -- "Bug in Your Hair"
Copyright (C) 2016 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> library(Lahman)
> png(filename="/home/ddbj/snapshot/RGM3/R_CC/result/Lahman/LahmanData.Rd_%03d_medium.png", width=480, height=480)
> ### Name: LahmanData
> ### Title: Lahman Datasets
> ### Aliases: LahmanData
> ### Keywords: datasets
> 
> ### ** Examples
> 
> data(LahmanData)
> 
> # find ID variables in the datasets
> IDvars <- lapply(LahmanData[,"file"], function(x) grep('.*ID$', colnames(get(x)), value=TRUE))
> names(IDvars) <- LahmanData[,"file"]
> str(IDvars)
List of 24
 $ AllstarFull        : chr [1:5] "playerID" "yearID" "gameID" "teamID" ...
 $ Appearances        : chr [1:4] "yearID" "teamID" "lgID" "playerID"
 $ AwardsManagers     : chr [1:4] "playerID" "awardID" "yearID" "lgID"
 $ AwardsPlayers      : chr [1:4] "playerID" "awardID" "yearID" "lgID"
 $ AwardsShareManagers: chr [1:4] "awardID" "yearID" "lgID" "playerID"
 $ AwardsSharePlayers : chr [1:4] "awardID" "yearID" "lgID" "playerID"
 $ Batting            : chr [1:4] "playerID" "yearID" "teamID" "lgID"
 $ BattingPost        : chr [1:4] "yearID" "playerID" "teamID" "lgID"
 $ CollegePlaying     : chr [1:3] "playerID" "schoolID" "yearID"
 $ Fielding           : chr [1:4] "playerID" "yearID" "teamID" "lgID"
 $ FieldingOF         : chr [1:2] "playerID" "yearID"
 $ FieldingPost       : chr [1:4] "playerID" "yearID" "teamID" "lgID"
 $ HallOfFame         : chr [1:2] "playerID" "yearID"
 $ Managers           : chr [1:4] "playerID" "yearID" "teamID" "lgID"
 $ ManagersHalf       : chr [1:4] "playerID" "yearID" "teamID" "lgID"
 $ Master             : chr [1:3] "playerID" "retroID" "bbrefID"
 $ Pitching           : chr [1:4] "playerID" "yearID" "teamID" "lgID"
 $ PitchingPost       : chr [1:4] "playerID" "yearID" "teamID" "lgID"
 $ Salaries           : chr [1:4] "yearID" "teamID" "lgID" "playerID"
 $ Schools            : chr "schoolID"
 $ SeriesPost         : chr "yearID"
 $ Teams              : chr [1:5] "yearID" "lgID" "teamID" "franchID" ...
 $ TeamsFranchises    : chr "franchID"
 $ TeamsHalf          : chr [1:4] "yearID" "lgID" "teamID" "divID"
> # vector of unique ID variables
> unique(unlist(IDvars))
 [1] "playerID" "yearID"   "gameID"   "teamID"   "lgID"     "awardID" 
 [7] "schoolID" "retroID"  "bbrefID"  "franchID" "divID"   
> 
> # which datasets have playerID?
> names(which(sapply(IDvars, function(x) "playerID" %in% x)))
 [1] "AllstarFull"         "Appearances"         "AwardsManagers"     
 [4] "AwardsPlayers"       "AwardsShareManagers" "AwardsSharePlayers" 
 [7] "Batting"             "BattingPost"         "CollegePlaying"     
[10] "Fielding"            "FieldingOF"          "FieldingPost"       
[13] "HallOfFame"          "Managers"            "ManagersHalf"       
[16] "Master"              "Pitching"            "PitchingPost"       
[19] "Salaries"           
> 
> ################################################
> # Visualize relations among datasets via an MDS
> ################################################
> # jaccard distance between two sets; assure positivity
> jaccard <- function(A, B) {
+ 	max(1 - length(intersect(A,B)) / length(union(A,B)), .00001)
+ }	
> 
> distmat <- function(vars, FUN=jaccard) {
+ 	nv <- length(vars)
+ 	d <- matrix(0, nv, nv, dimnames=list(names(vars), names(vars)))
+ 	for(i in 1:nv) {
+ 		for (j in 1:nv) {
+ 			if (i != j) d[i,j] <- FUN(vars[[i]], vars[[j]])
+ 		}
+ 	}
+ 	d
+ }
> 
> # do an MDS on distances
> distID <- distmat(IDvars)
> config <- cmdscale(distID)
> 
> pos=rep(1:4, length=nrow(config))
> plot(config[,1], config[,2], xlab = "", ylab = "", asp = 1, axes=FALSE,
+ 	main="MDS of ID variable distances of Lahman tables")
> abline(h=0, v=0, col="gray80")
> text(config[,1], config[,2], rownames(config), cex = 0.75, pos=pos, xpd=NA)
> 
> 
> 
> 
> 
> 
> dev.off()
null device 
          1 
>