-
Notifications
You must be signed in to change notification settings - Fork 0
/
Computation.Rmd
244 lines (222 loc) · 12.7 KB
/
Computation.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
---
title: "Computation"
author: "Rob Weber"
date: "July 24, 2018"
output: html_document
---
```{r setup, echo = FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r Library}
library(tidyverse)
library(ggplot2)
library(baseballr)
```
# Data
The in-file data-set names are as follows: "wp.df" is the full (with pitchers) fangraphs.com batting data set, "sc.data" is the 2017 statcast data, "team_stats" is the 2017 team data, "id" is the mlb id data set, "defense" is the fangraphs.com data set with the fielding data, and "pf" is the data set with the fangraphs.com basic park factors by team.
# Cleaning
Some cleaing is necessary before the process begins. First, we need the positions of all the players in the data set because fangraphs.com doesn't let positions be in a dataset with offensive stats for some reason. That is what the "defense" data is used for. Then, since this is WAR for position players, fangraphs.com eliminates any pitchers from the data set in their calculations, but I keep the other data set just in case. Lastly, some of the names don't match up with the names in the data set being used to match names to their respective player i.d.s in the statcast data set.
```{r Cleaning}
# first, this narrows down "defense" to just the player name, team, position,
## innings played at said position (it is needed for the positional adjustment later)
defense[ , c("Name", "Team", "Position", "Inn")] -> defense
# this next step switches the team abbreviations in "defense" to the team names like in "wp.df"
defense <- defense %>%
mutate(Team = ifelse(Team == "ARI", "Diamondbacks",
ifelse(Team == "ATL", "Braves",
ifelse(Team == "BAL", "Orioles",
ifelse(Team == "BOS", "Red Sox",
ifelse(Team == "CHC", "Cubs",
ifelse(Team == "CIN", "Reds",
ifelse(Team == "CLE", "Indians",
ifelse(Team == "COL", "Rockies",
ifelse(Team == "CHW", "White Sox",
ifelse(Team == "DET", "Tigers",
ifelse(Team == "HOU", "Astros",
ifelse(Team == "KCR", "Royals",
ifelse(Team == "LAA", "Angels",
ifelse(Team == "LAD", "Dodgers",
ifelse(Team == "MIA", "Marlins",
ifelse(Team == "MIL", "Brewers",
ifelse(Team == "MIN", "Twins",
ifelse(Team == "NYM", "Mets",
ifelse(Team == "NYY", "Yankees",
ifelse(Team == "OAK", "Athletics",
ifelse(Team == "PHI", "Phillies",
ifelse(Team == "PIT", "Pirates",
ifelse(Team == "SDP", "Padres",
ifelse(Team == "SEA", "Mariners",
ifelse(Team == "SFG", "Giants",
ifelse(Team == "STL", "Cardinals",
ifelse(Team == "TBR", "Rays",
ifelse(Team == "TEX", "Rangers",
ifelse(Team == "TOR", "Blue Jays",
ifelse(Team == "WSN", "Nationals",
"null")))))))))))))))))))))))))))))))
as.character(wp.df$Name) -> wp.df$Name; as.character(defense$Name) -> defense$Name # just makes sure the name columns are treated as characters
# this is to solve a headache produced by the fact that a player is in "defense"
## with data from every time they played a different position. What fangraphs does in terms of the
## positional adjustment for this is not stated. So, I just take the position they played the most innings and
## total the number of innings played at all positions.
defense <- defense %>%
group_by(Name, Team) %>%
summarise(Position = Position[Inn == max(Inn)],
Inn = sum(Inn))
left_join(wp.df, defense, by = c("Name", "Team")) -> wp.df # just gets the dfs together
# this step gets rid of the pitchers from the data set and creates a new df calle "full.df"
## It also gets rid of the few players without any registered plate appearances.
## This is what will be used from now on
full.df <- wp.df %>%
filter(Position != "P" & PA != 0)
str_replace_all(full.df$Name, "Nick Delmonico", "Nicky Delmonico") -> full.df$Name
str_replace_all(full.df$Name, "Nicholas Castellanos", "Nick Castellanos") -> full.df$Name
str_replace_all(full.df$Name, "Eric Young", "Eric Young Jr.") -> full.df$Name
str_replace_all(full.df$Name, "Raffy Lopez", "Rafael Lopez") -> full.df$Name
str_replace_all(full.df$Name, "Yolmer Sanchez", "Carlos Sanchez") -> full.df$Name
str_replace_all(full.df$Name, "Rickie Weeks Jr.", "Rickie Weeks") -> full.df$Name
str_replace_all(full.df$Name, "Zack Granite", "Zach Granite") -> full.df$Name
str_replace_all(full.df$Name, "Hyun Soo Kim", "Hyun-soo Kim") -> full.df$Name
str_replace_all(full.df$Name, "Rey Fuentes", "Reymond Fuentes") -> full.df$Name
str_replace_all(full.df$Name, "Vince Velasquez", "Vincent Velasquez") -> full.df$Name
str_replace_all(full.df$Name, "Cam Perkins", "Cameron Perkins") -> full.df$Name
str_replace_all(full.df$Name, "Adalberto Mondesi", "Raul Mondesi") -> full.df$Name
str_replace_all(full.df$Name, "Luke Sims", "Lucas Sims") -> full.df$Name
str_replace_all(full.df$Name, "Rubby de la Rosa", "Rubby De La Rosa") -> full.df$Name
str_replace_all(full.df$Name, "AJ Ramos", "A.J. Ramos") -> full.df$Name
str_replace_all(full.df$Name, "T.J. House", "TJ House") -> full.df$Name
str_replace_all(full.df$Name, "Felipe Vasquez", "Felipe Rivero") -> full.df$Name
str_replace_all(full.df$Name, "Dan Winkler", "Daniel Winkler") -> full.df$Name
str_replace_all(full.df$Name, "Jakob Junis", "Jake Junis") -> full.df$Name
str_replace_all(full.df$Name, "Jake Faria", "Jacob Faria") -> full.df$Name
str_replace_all(full.df$Name, "Matt Strahm", "Matthew Strahm") -> full.df$Name
str_replace_all(full.df$Name, "Lance McCullers Jr.", "Lance McCullers") -> full.df$Name
id[-c(which(id$mlb_id == 446345)), ] -> id
```
# Batting Runs
- Bat = wRAA + (lgR/PA - (PF*lgR/PA))*PA + (lgR/PA - (AL or NL non-pitcher wRC/PA))*PA
- wRAA = ((wOBA - lgwOBA)/wOBA Scale) * PA
- wOBA = (weight*uBB + weight*HBP + weight*1B + weight*2B + weight*3B + weight*HR) / (AB + uBB + SF + HBP)
```{r Batting Runs}
# wOBA first
# This uses baseballr functions to get the linear weights and then stores it
run_expectancy_code(sc.data) -> weights
linear_weights_savant(weights) -> weights
# want to rename the fangraphs wOBA so it can be differentiated later
full.df <- full.df %>%
rename(wOBA_fg = wOBA) %>%
mutate(uBB = BB - IBB) # get unintentional walks too
# this gets wOBA not scaled to OBP
(((weights$linear_weights_above_outs[weights$events == "walk"] * full.df$uBB) +
(weights$linear_weights_above_outs[weights$events == "hit_by_pitch"] * full.df$HBP) +
(weights$linear_weights_above_outs[weights$events == "single"] * full.df$X1B) +
(weights$linear_weights_above_outs[weights$events == "double"] * full.df$X2B) +
(weights$linear_weights_above_outs[weights$events == "triple"] * full.df$X3B) +
(weights$linear_weights_above_outs[weights$events == "home_run"] * full.df$HR))
/ (full.df$AB + full.df$uBB + full.df$SF + full.df$HBP)) -> full.df$wOBA_unscaled
# now to get the weights scaled to OBP
# need the mean of both OBP and unscaled wOBA
mean(full.df$OBP, na.rm = T) -> obp_mean
mean(full.df$wOBA_unscaled, na.rm = T) -> woba_unscaled_mean
# this gets the wOBA scale
obp_mean / woba_unscaled_mean -> woba_scale
# gets the scaled weights
weights$linear_weights_above_outs * woba_scale -> weights$linear_weights_scaled
# get scaled wOBA
(((weights$linear_weights_scaled[weights$events == "walk"] * full.df$uBB) +
(weights$linear_weights_scaled[weights$events == "hit_by_pitch"] * full.df$HBP) +
(weights$linear_weights_scaled[weights$events == "single"] * full.df$X1B) +
(weights$linear_weights_scaled[weights$events == "double"] * full.df$X2B) +
(weights$linear_weights_scaled[weights$events == "triple"] * full.df$X3B) +
(weights$linear_weights_scaled[weights$events == "home_run"] * full.df$HR))
/ (full.df$AB + full.df$uBB + full.df$SF + full.df$HBP)) -> full.df$wOBA_x
# wRAA next
((full.df$wOBA_x - mean(full.df$wOBA_x, na.rm = T)) / woba_scale) * full.df$PA -> full.df$wRAA_x
# now batting runs
# league runs per plate appearance
lgR_PA <- sum(full.df$R) / sum(full.df$PA)
# AL or NL non-pitcher wRC/PA: wRC = (((wOBA-League wOBA)/wOBA Scale)+(League R/PA))*PA
# need the league (AL or NL) for each player first
ifelse(full.df$Team %in% c("Yankees", "Red Sox", "Rays", "Orioles", "Blue Jays", "White Sox", "Indians", "Royals", "Tigers", "Twins",
"Mariners", "Angels", "Rangers", "Athletics", "Astros"),
"AL", "NL") -> full.df$League
## AL first
full.df_AL <- full.df[full.df$League == "AL", ]
((((full.df_AL$wOBA_x - mean(full.df$wOBA_x)) / woba_scale) + (lgR_PA)) * full.df_AL$PA) -> wRC.PA_AL # get wRC for each AL player first
sum(wRC.PA_AL, na.rm = T) / sum(full.df_AL$PA, na.rm = T) -> wRC.PA_AL # then get the AL wRC per PA
## then the same for the NL
full.df_NL <- full.df[full.df$League == "NL", ]
((((full.df_NL$wOBA_x - mean(full.df$wOBA_x)) / woba_scale) + (lgR_PA)) * full.df_NL$PA) -> wRC.PA_NL
sum(wRC.PA_NL, na.rm = T) / sum(full.df_NL$PA, na.rm = T) -> wRC.PA_NL
# need the park factor in the data set too
pf[ , c("Team", "Basic")] -> pf
c("Team", "PF") -> colnames(pf) # get everything named nice and neat
pf$PF / 100 -> pf$PF # the WAR calculation uses the park factor divided by 100
left_join(full.df, pf, by = "Team") -> full.df # gets the park factors into the full df
# this is that actual batting runs calculation
ifelse(full.df$League == "AL",
(full.df$wRAA_x + ((lgR_PA - (full.df$PF * lgR_PA)) * full.df$PA) + ((lgR_PA - wRC.PA_AL) * full.df$PA)),
(full.df$wRAA_x + ((lgR_PA - (full.df$PF * lgR_PA)) * full.df$PA) + ((lgR_PA - wRC.PA_NL) * full.df$PA))) -> full.df$Bat_x
```
# Baserunning Runs
- BR = UBR + wSB + wGDP
```{r Baserunning Runs}
# wSB was the only one that I was able to calculate on my own (see README)
# wSB = SB * runSB + CS * runCS - lgwSB * (1B + uBB + HBP)
## runSB is always 0.2
2 -> runSB
## runCS = 2 x RunsPerOut + 0.075
### runs per out = (season total runs) / (season total outs)
runCS <- -(2 * (sum(team_stats$R) / (81 * 30 * 54)) + 0.075)
## lgwSB = (SB * runSB + CS * runCS) / (1B + uBB + HBP)
lgwSB <- ((sum(full.df$SB) * 0.2) + (sum(full.df$CS) * runCS)) / (sum(full.df$X1B) + sum(full.df$uBB) + sum(full.df$HBP))
full.df$wSB_x <- (full.df$SB * runSB) + (full.df$CS * runCS) - (lgwSB * (full.df$X1B + full.df$uBB + full.df$HBP))
# so BR runs is just adding up the pieces
full.df$BR <- full.df$wSB_x + full.df$UBR + full.df$wGDP
```
# Fielding Runs
For fielding runs, fangraphs.com uses their UZR number, which I am not able to calculate (see README), for non-catchers and the sum of rSB and RPP for catchers, which I am also unable to calculate.
# Positional Adjustment
- Pos = ((Inn/9) / 162) * position specific run value
```{r Positional Adjustment}
# the first step is to get a df of the positions and their respective adjustments taken from fangraphs
pos_adj <- data.frame("Position" = c("C", "1B", "2B", "3B", "SS", "LF", "CF", "RF", "DH"),
"Positional_Adjustment" = c(12.5, -12.5, 2.5, 2.5, 7.5, -7.5, 2.5, -7.5, -17.5))
left_join(full.df, pos_adj, by = "Position") -> full.df # this gets the uncalculated positional adjustment next the the player at that position
# this next step just performs the fagnraphs calculation
full.df <- full.df %>%
mutate(Positional_Adjustment = (((Inn / 9)) / 162) * Positional_Adjustment)
```
# League Adjustment
- ((-1)*(lgBatting Runs + lgBase Running Runs + lgFielding Runs + lgPositional Adjustment) / lgPA)*PA
```{r League Adjustment}
# this finds the base adjustment for both leagues
lg_adj <- full.df %>%
group_by(League) %>%
summarise(League_Adjustment = ((-1)*(sum(Bat, na.rm = T) + sum(BR, na.rm = T) + sum(Fld, na.rm = T) + sum(Positional_Adjustment, na.rm = T))
/ sum(PA, na.rm = T)))
# gets the base league adjustments into the df next the player in the respective league
left_join(full.df, lg_adj, by = "League") -> full.df
# then that number just needs to multiplied by the player's plate appearances
full.df <- full.df %>%
mutate(League_Adjustment = League_Adjustment * PA)
```
# Runs per Win
fangraphs.com uses 10.048 as RPW for 2017
```{r RPW}
RPW = 10.048
```
# Replacement Runs
- Rep = 570 * (RPW/lgPA) * PA
```{r Replacement Runs}
# gets the total number of plate appearances by all non-pitchers
sum(wp.df$PA) -> lgPA
# gets replacement runs
full.df <- full.df %>%
mutate(Rep_x = 570 * (RPW / lgPA) * PA)
```
# WAR
- WAR = (Bat + BR + Fld + Positional_Adjustment + League_Adjustment + Rep) / (RPW)
```{r WAR}
full.df <- full.df %>%
mutate(WAR_x = (Bat_x + BR + Fld + Positional_Adjustment + League_Adjustment + Rep_x) / RPW)
```