"""Data cleaning and graphing code."""
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import json
import pandas as pd
import numpy as np
import scipy.stats as sp
# Load the league records from disk. JSON text is UTF-8, so the encoding is
# stated explicitly instead of relying on the platform default.
with open("file.json", 'r', encoding="utf-8") as f:
    LeagueMasterList = json.load(f)
def double_std(array):
    """Return twice the standard deviation of *array*.

    The deviation comes from ``np.std`` with its default ``ddof=0``
    (population standard deviation).

    The function is passed to pandas ``.agg`` in this file, which labels the
    resulting column with the function's name; the plotting code then reads
    that column as ``'double_std'``. Renaming this function therefore
    changes that column label.
    """
    return np.std(array) * 2
# Seasons with fewer than this many rows left after cleaning are discarded.
# NOTE(review): the original comment said "less than 9 clubs" while the code
# compared against 5; the code's threshold is kept here - confirm which value
# was intended.
_MIN_CLUBS_PER_SEASON = 5


def _clean_league_frame(records):
    """Build a DataFrame from one league's ``'Data'`` records and clean it.

    Steps, in order:
      * cast ``SquadCount`` to int, then drop rows whose ``AverageAgeSquad``
        is blank or whose ``SquadCount`` is 10 or less (high-error data);
      * cast ``PlayersUsed`` to int and the age columns to float;
      * keep only rows whose ``AverageAgeSquad`` and ``AverageAgeStartingXI``
        lie strictly between 0 and 50;
      * cast ``AverageAge``, ``SquadCount`` and ``SeasonStartYear`` to float;
      * drop every season that has fewer than ``_MIN_CLUBS_PER_SEASON`` rows
        remaining.

    The casts are strict (``astype``), so a value that cannot be converted
    raises instead of being dropped silently. An empty input is returned
    unchanged.
    """
    frame = pd.DataFrame(records)
    if frame.empty:
        return frame

    frame['SquadCount'] = frame['SquadCount'].astype('int')

    # Filter blanks and low squad counts (high-error data). The .copy() calls
    # give each filtered frame its own data so the column assignments that
    # follow do not write into a slice of the previous frame.
    has_squad_age = frame['AverageAgeSquad'] != ''
    big_enough = frame['SquadCount'] > 10
    frame = frame[has_squad_age & big_enough].copy()

    frame['PlayersUsed'] = frame['PlayersUsed'].astype('int')
    frame['AverageAgeSquad'] = frame['AverageAgeSquad'].astype('float')

    # Trim out bad squad-age data.
    squad_age = frame['AverageAgeSquad']
    frame = frame[(squad_age > 0) & (squad_age < 50)].copy()

    frame['AverageAgeStartingXI'] = frame['AverageAgeStartingXI'].astype('float')

    # Trim out bad starting-XI data.
    xi_age = frame['AverageAgeStartingXI']
    frame = frame[(xi_age > 0) & (xi_age < 50)].copy()

    frame['AverageAge'] = frame['AverageAge'].astype('float')
    frame['SquadCount'] = frame['SquadCount'].astype('float')
    frame['SeasonStartYear'] = frame['SeasonStartYear'].astype('float')

    # Filter out seasons with too few rows left to give a meaningful spread.
    rows_per_season = frame['SeasonStartYear'].value_counts()
    sparse_seasons = rows_per_season[rows_per_season < _MIN_CLUBS_PER_SEASON].index
    return frame[~frame['SeasonStartYear'].isin(sparse_seasons)]


def _season_age_summary(frame, column):
    """Summarise *column* of *frame* per ``SeasonStartYear``.

    Returns a DataFrame with the columns ``SeasonStartYear``, ``mean``,
    ``double_std`` and ``sem`` (one row per season). The ``sem`` column is
    computed but not used by the plots in this block.

    Selecting the column before aggregating and calling ``reset_index()``
    afterwards keeps ``SeasonStartYear`` as a regular column whether or not
    the installed pandas honours ``as_index=False`` for list aggregations.
    """
    per_season = frame.groupby('SeasonStartYear')[column]
    return per_season.agg(['mean', double_std, sp.sem]).reset_index()


for league in LeagueMasterList:
    df = _clean_league_frame(league['Data'])
    if df.empty:
        # Nothing usable survived cleaning, so there is nothing to plot.
        continue

    dfSquad = _season_age_summary(df, 'AverageAgeSquad')
    dfXI = _season_age_summary(df, 'AverageAgeStartingXI')
    # Add 0.5 so easier to display side by side with squad avg age in graph
    dfXI['SeasonStartYear'] = dfXI['SeasonStartYear'] + 0.5

    # One figure per league. It is neither shown, saved nor closed here, which
    # matches the original; at this size each open figure is memory-heavy.
    figure(figsize=(20, 20), dpi=380)
    ax2 = plt.subplot(211)
    # NOTE(review): the original passed an undefined `ax` to the scatter plot
    # below (NameError on the first league). The figure uses a two-row grid
    # with only the top panel populated, so the scatter is placed in the
    # bottom panel - confirm this is the intended layout.
    ax = plt.subplot(212)

    # Top panel: per-season mean age with +/- two standard deviations.
    ax2.set_xlim(left=1925, right=2025)
    ax2.errorbar(dfSquad['SeasonStartYear'], dfSquad['mean'], color='tab:orange', fmt='o', label='AvgAgeSquad', yerr=dfSquad['double_std'], ecolor='tab:orange', capsize=4, elinewidth=3)
    ax2.errorbar(dfXI['SeasonStartYear'], dfXI['mean'], color='tab:blue', fmt='o', label='AvgAgeStartingXI', yerr=dfXI['double_std'], ecolor='tab:blue', capsize=4, elinewidth=3)
    # Address the top axes explicitly; the pyplot-level calls would act on
    # whichever axes happens to be current.
    ax2.legend(loc='upper left')
    ax2.set_title(league['Name'])
    ax2.grid(True)

    # Bottom panel: the individual starting-XI ages behind the summary.
    # Cleaning already restricted AverageAgeStartingXI to (0, 50), so no
    # further filtering is needed.
    df.plot.scatter('SeasonStartYear', 'AverageAgeStartingXI', color='DarkGreen', ax=ax, alpha=0.3, label='AvgAgeStartingXI')