Last data update: 2014.03.03
|
R: Lahman Datasets
LahmanData | R Documentation |
Lahman Datasets
Description
This dataset gives a concise description of the data files in the Lahman package.
It may be useful for computing on the various files.
Usage
data(LahmanData)
Format
A data frame with 24 observations on the following 5 variables.
file: name of dataset
class: class of dataset
nobs: number of observations
nvar: number of variables
title: dataset title
Details
This dataset is generated using vcdExtra::datasets(package="Lahman")
with some post-processing.
Examples
data(LahmanData)
# find ID variables in the datasets
# (each entry of the "file" column is looked up by name with get(), so the
# datasets it names must be visible on the search path)
IDvars <- lapply(LahmanData[, "file"], function(x) {
  grep('.*ID$', colnames(get(x)), value = TRUE)
})
names(IDvars) <- LahmanData[, "file"]
str(IDvars)
# vector of unique ID variables
unique(unlist(IDvars))
# which datasets have playerID?
# vapply() pins the result to exactly one logical per dataset, whereas the
# return type of sapply() depends on its input
names(which(vapply(IDvars, function(x) "playerID" %in% x, logical(1))))
################################################
# Visualize relations among datasets via an MDS
################################################
# Jaccard distance between two sets; assure positivity.
#
# A, B: vectors treated as sets (union/intersect ignore duplicates).
# Returns 1 - |intersect(A, B)| / |union(A, B)|, floored at .00001 so that
# identical sets still get a strictly positive distance.
# Two empty sets are treated as identical (the floor value is returned)
# instead of producing NaN from 0 / 0.
jaccard <- function(A, B) {
  n_union <- length(union(A, B))
  if (n_union == 0) {
    return(.00001)
  }
  max(1 - length(intersect(A, B)) / n_union, .00001)
}
# Build a square matrix of pairwise distances between the elements of a list.
#
# vars: a list (typically named); pairs of its elements are passed to FUN.
# FUN:  function of two elements returning a single number (default: jaccard).
# Returns an nv x nv numeric matrix, where nv = length(vars), with zeros on
# the diagonal and FUN(vars[[i]], vars[[j]]) in cell [i, j]; row and column
# names are taken from names(vars).
distmat <- function(vars, FUN=jaccard) {
  nv <- length(vars)
  d <- matrix(0, nv, nv, dimnames = list(names(vars), names(vars)))
  # seq_len() rather than 1:nv: for an empty list 1:0 would iterate over
  # c(1, 0) and fail on vars[[1]]; seq_len(0) skips the loops entirely.
  for (i in seq_len(nv)) {
    for (j in seq_len(nv)) {
      # every off-diagonal cell is filled separately; FUN is not assumed
      # to be symmetric
      if (i != j) {
        d[i, j] <- FUN(vars[[i]], vars[[j]])
      }
    }
  }
  d
}
# do an MDS on distances
distID <- distmat(IDvars)
config <- cmdscale(distID)
# cycle the label position codes 1:4 across the points; text() interprets
# them as below, left, above and right of the point respectively
pos <- rep(1:4, length.out = nrow(config))
plot(config[, 1], config[, 2], xlab = "", ylab = "", asp = 1, axes = FALSE,
     main = "MDS of ID variable distances of Lahman tables")
abline(h = 0, v = 0, col = "gray80")
# xpd = NA lets labels extend beyond the plot region instead of being clipped
text(config[, 1], config[, 2], rownames(config), cex = 0.75, pos = pos, xpd = NA)
Results
R version 3.3.1 (2016-06-21) -- "Bug in Your Hair"
Copyright (C) 2016 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
> library(Lahman)
> png(filename="/home/ddbj/snapshot/RGM3/R_CC/result/Lahman/LahmanData.Rd_%03d_medium.png", width=480, height=480)
> ### Name: LahmanData
> ### Title: Lahman Datasets
> ### Aliases: LahmanData
> ### Keywords: datasets
>
> ### ** Examples
>
> data(LahmanData)
>
> # find ID variables in the datasets
> IDvars <- lapply(LahmanData[,"file"], function(x) grep('.*ID$', colnames(get(x)), value=TRUE))
> names(IDvars) <- LahmanData[,"file"]
> str(IDvars)
List of 24
$ AllstarFull : chr [1:5] "playerID" "yearID" "gameID" "teamID" ...
$ Appearances : chr [1:4] "yearID" "teamID" "lgID" "playerID"
$ AwardsManagers : chr [1:4] "playerID" "awardID" "yearID" "lgID"
$ AwardsPlayers : chr [1:4] "playerID" "awardID" "yearID" "lgID"
$ AwardsShareManagers: chr [1:4] "awardID" "yearID" "lgID" "playerID"
$ AwardsSharePlayers : chr [1:4] "awardID" "yearID" "lgID" "playerID"
$ Batting : chr [1:4] "playerID" "yearID" "teamID" "lgID"
$ BattingPost : chr [1:4] "yearID" "playerID" "teamID" "lgID"
$ CollegePlaying : chr [1:3] "playerID" "schoolID" "yearID"
$ Fielding : chr [1:4] "playerID" "yearID" "teamID" "lgID"
$ FieldingOF : chr [1:2] "playerID" "yearID"
$ FieldingPost : chr [1:4] "playerID" "yearID" "teamID" "lgID"
$ HallOfFame : chr [1:2] "playerID" "yearID"
$ Managers : chr [1:4] "playerID" "yearID" "teamID" "lgID"
$ ManagersHalf : chr [1:4] "playerID" "yearID" "teamID" "lgID"
$ Master : chr [1:3] "playerID" "retroID" "bbrefID"
$ Pitching : chr [1:4] "playerID" "yearID" "teamID" "lgID"
$ PitchingPost : chr [1:4] "playerID" "yearID" "teamID" "lgID"
$ Salaries : chr [1:4] "yearID" "teamID" "lgID" "playerID"
$ Schools : chr "schoolID"
$ SeriesPost : chr "yearID"
$ Teams : chr [1:5] "yearID" "lgID" "teamID" "franchID" ...
$ TeamsFranchises : chr "franchID"
$ TeamsHalf : chr [1:4] "yearID" "lgID" "teamID" "divID"
> # vector of unique ID variables
> unique(unlist(IDvars))
[1] "playerID" "yearID" "gameID" "teamID" "lgID" "awardID"
[7] "schoolID" "retroID" "bbrefID" "franchID" "divID"
>
> # which datasets have playerID?
> names(which(sapply(IDvars, function(x) "playerID" %in% x)))
[1] "AllstarFull" "Appearances" "AwardsManagers"
[4] "AwardsPlayers" "AwardsShareManagers" "AwardsSharePlayers"
[7] "Batting" "BattingPost" "CollegePlaying"
[10] "Fielding" "FieldingOF" "FieldingPost"
[13] "HallOfFame" "Managers" "ManagersHalf"
[16] "Master" "Pitching" "PitchingPost"
[19] "Salaries"
>
> ################################################
> # Visualize relations among datasets via an MDS
> ################################################
> # jaccard distance between two sets; assure positivity
> jaccard <- function(A, B) {
+ max(1 - length(intersect(A,B)) / length(union(A,B)), .00001)
+ }
>
> distmat <- function(vars, FUN=jaccard) {
+ nv <- length(vars)
+ d <- matrix(0, nv, nv, dimnames=list(names(vars), names(vars)))
+ for(i in 1:nv) {
+ for (j in 1:nv) {
+ if (i != j) d[i,j] <- FUN(vars[[i]], vars[[j]])
+ }
+ }
+ d
+ }
>
> # do an MDS on distances
> distID <- distmat(IDvars)
> config <- cmdscale(distID)
>
> pos=rep(1:4, length=nrow(config))
> plot(config[,1], config[,2], xlab = "", ylab = "", asp = 1, axes=FALSE,
+ main="MDS of ID variable distances of Lahman tables")
> abline(h=0, v=0, col="gray80")
> text(config[,1], config[,2], rownames(config), cex = 0.75, pos=pos, xpd=NA)
>
>
>
>
>
>
> dev.off()
null device
1
>
|
|