battingStats for calculating batting average (BA) and other derived statistics
baseball for a similar dataset, but a subset of players who played 15 or more seasons.
Baseball for data on batting in the 1987 season.
Examples
# Load the Batting table and look at its first rows
data(Batting)
head(Batting)

# plyr supplies arrange() below. library() stops with an error if the
# package is not installed, whereas require() would only return FALSE and
# let the script fail later with a less helpful message.
library(plyr)

# calculate batting average and other stats
batting <- battingStats()

# add salary to Batting data; need to match by player, year and team
# (all.x = TRUE keeps batting rows that have no matching salary record)
batting <- merge(batting,
                 Salaries[, c("playerID", "yearID", "teamID", "salary")],
                 by = c("playerID", "yearID", "teamID"), all.x = TRUE)

# Add name, age and bat hand information.
# The join key is spelled out so the merge cannot silently start matching on
# any other column name the two tables might come to share.
masterInfo <- Master[, c('playerID', 'birthYear', 'birthMonth',
                         'nameLast', 'nameFirst', 'bats')]
batting <- merge(batting, masterInfo, by = "playerID", all.x = TRUE)

# Age for a season: difference in years, less one for players whose birth
# month is October or later
batting$age <- with(batting, yearID - birthYear -
                      ifelse(birthMonth < 10, 0, 1))

# Order rows by player, then season, then stint within the season
batting <- arrange(batting, playerID, yearID, stint)
## Generate a ggplot similar to the NYT graph in the story about Ted
## Williams and the last .400 MLB season
# http://www.nytimes.com/interactive/2011/09/18/sports/baseball/WILLIAMS-GRAPHIC.html

# Restrict the pool of eligible players to seasons from 1900 onward and
# players with more than 450 plate appearances (this covers the
# strike year of 1994 when Tony Gwynn hit .394 before play was suspended
# for the season - in a normal year, the minimum number of plate appearances is 502)
eligibleHitters <- subset(batting, yearID >= 1900 & PA > 450)

# Find the hitters with the highest BA in MLB each year (there are a
# few ties). Also include every player at or above .400, using the same
# >= threshold as the ba400 flag below so the two cannot disagree.
topHitters <- ddply(eligibleHitters, .(yearID), subset,
                    BA == max(BA) | BA >= 0.400)

# Logical flag (TRUE/FALSE) to distinguish the .400 hitters
topHitters$ba400 <- with(topHitters, BA >= 0.400)

# Sub-data frame for the .400 hitters plus the outliers after 1950
# (averages above .380) - used to produce labels in the plot below
bignames <- rbind(subset(topHitters, ba400),
                  subset(topHitters, yearID > 1950 & BA > 0.380))

# Cut to the relevant set of variables
bignames <- subset(bignames, select = c('playerID', 'yearID', 'nameLast',
                                        'nameFirst', 'BA'))

# Ditto for the original data frame
topHitters <- subset(topHitters, select = c('playerID', 'yearID', 'BA', 'ba400'))

# Positional offsets to spread out certain labels.
# Each vector has 17 entries, one per row of bignames; the assignment fails
# if bignames has a different number of rows.
# NOTE(review): the initials below presumably name the labelled players in
# row order - confirm against bignames before editing the offsets.
# NL TC JJ TC GS TC RH GS HH RH RH BT TW TW RC GB TG
bignames$xoffset <- c(0, 0, 0, 0, 0, 0, 0, 0, -8, 0, 3, 3, 0, 0, -2, 0, 0)
bignames$yoffset <- c(0, 0, -0.003, 0, 0, 0, 0, 0, -0.004, 0, 0, 0, 0, 0, -0.003, 0, 0) + 0.002
# library() stops with an error if ggplot2 is missing; require() would only
# return FALSE and let the ggplot() call below fail instead.
library(ggplot2)

# Plot BA against season for the rows of topHitters: points coloured by the
# ba400 flag, a horizontal reference line at .400, name labels taken from
# bignames (shifted by its xoffset/yoffset columns), and a smoother.
ggplot(topHitters, aes(x = yearID, y = BA)) +
  geom_point(aes(colour = ba400), size = 2.5) +
  geom_hline(yintercept = 0.400, size = 1) +
  geom_text(data = bignames,
            aes(x = yearID + xoffset, y = BA + yoffset, label = nameLast),
            size = 3) +
  scale_colour_manual(values = c('FALSE' = 'black', 'TRUE' = 'red')) +
  xlab('Year') +
  # The y limits are given here rather than through a separate ylim() call:
  # a plot keeps only one y scale, so ylim() followed by scale_y_continuous()
  # would have its limits replaced and silently dropped.
  scale_y_continuous('Batting average',
                     limits = c(0.330, 0.430),
                     breaks = seq(0.34, 0.42, by = 0.02),
                     labels = c('.340', '.360', '.380', '.400', '.420')) +
  geom_smooth() +
  theme(legend.position = 'none')
##########################################################
# after Chris Green,
# http://sabr.org/research/baseball-s-first-power-surge-home-runs-late-19th-century-major-leagues

# Per-season totals: home runs and games summed over every Batting row of
# that year, ignoring missing values.
# NOTE(review): the as.numeric() conversion presumably guards against
# integer overflow in sum() - confirm.
totalHR <- ddply(Batting, "yearID", summarise,
                 HomeRuns = sum(as.numeric(HR), na.rm = TRUE),
                 Games = sum(as.numeric(G), na.rm = TRUE))

# Seasons up to and including 1918: raw home-run totals
plot(HomeRuns ~ yearID, data = totalHR[totalHR$yearID <= 1918, ])
# take games into account?
plot(HomeRuns/Games ~ yearID, data = totalHR[totalHR$yearID <= 1918, ])

# long term trend? Same two plots over every season in the table
plot(HomeRuns ~ yearID, data = totalHR)
plot(HomeRuns/Games ~ yearID, data = totalHR)
Results
R version 3.3.1 (2016-06-21) -- "Bug in Your Hair"
Copyright (C) 2016 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
> library(Lahman)
> png(filename="/home/ddbj/snapshot/RGM3/R_CC/result/Lahman/Batting.Rd_%03d_medium.png", width=480, height=480)
> ### Name: Batting
> ### Title: Batting table
> ### Aliases: Batting
> ### Keywords: datasets
>
> ### ** Examples
>
> data(Batting)
> head(Batting)
playerID yearID stint teamID lgID G AB R H X2B X3B HR RBI SB CS BB SO
1 abercda01 1871 1 TRO NA 1 4 0 0 0 0 0 0 0 0 0 0
2 addybo01 1871 1 RC1 NA 25 118 30 32 6 0 0 13 8 1 4 0
3 allisar01 1871 1 CL1 NA 29 137 28 40 4 5 0 19 3 1 2 5
4 allisdo01 1871 1 WS3 NA 27 133 28 44 10 2 2 27 1 1 0 2
5 ansonca01 1871 1 RC1 NA 25 120 29 39 11 3 0 16 6 2 2 1
6 armstbo01 1871 1 FW1 NA 12 49 9 11 2 1 0 5 0 1 0 1
IBB HBP SH SF GIDP
1 NA NA NA NA NA
2 NA NA NA NA NA
3 NA NA NA NA NA
4 NA NA NA NA NA
5 NA NA NA NA NA
6 NA NA NA NA NA
> require('plyr')
Loading required package: plyr
>
> # calculate batting average and other stats
> batting <- battingStats()
>
> # add salary to Batting data; need to match by player, year and team
> batting <- merge(batting,
+ Salaries[,c("playerID", "yearID", "teamID", "salary")],
+ by=c("playerID", "yearID", "teamID"), all.x=TRUE)
>
> # Add name, age and bat hand information:
> masterInfo <- Master[, c('playerID', 'birthYear', 'birthMonth',
+ 'nameLast', 'nameFirst', 'bats')]
> batting <- merge(batting, masterInfo, all.x = TRUE)
> batting$age <- with(batting, yearID - birthYear -
+ ifelse(birthMonth < 10, 0, 1))
>
> batting <- arrange(batting, playerID, yearID, stint)
>
> ## Generate a ggplot similar to the NYT graph in the story about Ted
> ## Williams and the last .400 MLB season
> # http://www.nytimes.com/interactive/2011/09/18/sports/baseball/WILLIAMS-GRAPHIC.html
>
> # Restrict the pool of eligible players to the years after 1899 and
> # players with a minimum of 450 plate appearances (this covers the
> # strike year of 1994 when Tony Gwynn hit .394 before play was suspended
> # for the season - in a normal year, the minimum number of plate appearances is 502)
> eligibleHitters <- subset(batting, yearID >= 1900 & PA > 450)
>
> # Find the hitters with the highest BA in MLB each year (there are a
> # few ties). Include all players with BA > .400
> topHitters <- ddply(eligibleHitters, .(yearID), subset, (BA == max(BA))|BA > .400)
>
> # Create a factor variable to distinguish the .400 hitters
> topHitters$ba400 <- with(topHitters, BA >= 0.400)
>
> # Sub-data frame for the .400 hitters plus the outliers after 1950
> # (averages above .380) - used to produce labels in the plot below
> bignames <- rbind(subset(topHitters, ba400),
+ subset(topHitters, yearID > 1950 & BA > 0.380))
> # Cut to the relevant set of variables
> bignames <- subset(bignames, select = c('playerID', 'yearID', 'nameLast',
+ 'nameFirst', 'BA'))
>
> # Ditto for the original data frame
> topHitters <- subset(topHitters, select = c('playerID', 'yearID', 'BA', 'ba400'))
>
> # Positional offsets to spread out certain labels
> # NL TC JJ TC GS TC RH GS HH RH RH BT TW TW RC GB TG
> bignames$xoffset <- c(0, 0, 0, 0, 0, 0, 0, 0, -8, 0, 3, 3, 0, 0, -2, 0, 0)
> bignames$yoffset <- c(0, 0, -0.003, 0, 0, 0, 0, 0, -0.004, 0, 0, 0, 0, 0, -0.003, 0, 0) + 0.002
>
> require('ggplot2')
Loading required package: ggplot2
> ggplot(topHitters, aes(x = yearID, y = BA)) +
+ geom_point(aes(colour = ba400), size = 2.5) +
+ geom_hline(yintercept = 0.400, size = 1) +
+ geom_text(data = bignames, aes(x = yearID + xoffset, y = BA + yoffset,
+ label = nameLast), size = 3) +
+ scale_colour_manual(values = c('FALSE' = 'black', 'TRUE' = 'red')) +
+ ylim(0.330, 0.430) +
+ xlab('Year') +
+ scale_y_continuous('Batting average',
+ breaks = seq(0.34, 0.42, by = 0.02),
+ labels = c('.340', '.360', '.380', '.400', '.420')) +
+ geom_smooth() +
+ theme(legend.position = 'none')
Scale for 'y' is already present. Adding another scale for 'y', which will
replace the existing scale.
>
> ##########################################################
> # after Chris Green,
> # http://sabr.org/research/baseball-s-first-power-surge-home-runs-late-19th-century-major-leagues
>
> # Total home runs by year
> totalHR <- ddply(Batting, .(yearID), summarise,
+ HomeRuns = sum(as.numeric(HR), na.rm=TRUE),
+ Games = sum(as.numeric(G), na.rm=TRUE)
+ )
>
> plot(HomeRuns ~ yearID, data=subset(totalHR, yearID<=1918))
> # take games into account?
> plot(HomeRuns/Games ~ yearID, data=subset(totalHR, yearID<=1918))
>
> # long term trend?
> plot(HomeRuns ~ yearID, data=totalHR)
> plot(HomeRuns/Games ~ yearID, data=totalHR)
>
>
>
>
>
>
>
> dev.off()
null device
1
>