Setup and Data¶

In [1]:
import pandas as pd

awards = pd.read_csv(".../awards_data.csv")
player_data = pd.read_csv(".../player_stats.csv")
team_data = pd.read_csv(".../team_stats.csv")
rebounding_data = pd.read_csv("...team_rebounding_data_22.csv")
In [2]:
awards
Out[2]:
season nbapersonid All NBA Defensive First Team All NBA Defensive Second Team All NBA First Team All NBA Second Team All NBA Third Team All Rookie First Team All Rookie Second Team Bill Russell NBA Finals MVP ... all_star_game rookie_all_star_game allstar_rk Defensive Player Of The Year_rk Most Improved Player_rk Most Valuable Player_rk Rookie Of The Year_rk Sixth Man Of The Year_rk all_nba_points_rk all_rookie_points_rk
0 2007 708.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... True False 1.0 1.0 NaN 3.0 NaN NaN NaN NaN
1 2007 947.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... True False 2.0 NaN NaN NaN NaN NaN NaN NaN
2 2007 948.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... NaN NaN 3.0 2.0 NaN NaN NaN NaN NaN NaN
3 2007 959.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 ... True False 4.0 NaN NaN 9.0 NaN NaN NaN NaN
4 2007 977.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... True False 1.0 5.0 NaN 1.0 NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4324 2015 1626170.0 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 24.0
4325 2015 1626202.0 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 24.0
4326 2015 1626273.0 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 24.0
4327 2018 1628971.0 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 18.0
4328 2020 1630214.0 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 18.0

4329 rows × 23 columns

In [3]:
player_data
Out[3]:
nbapersonid player draftyear draftpick season nbateamid team games games_start mins ... blk_pct tov_pct usg OWS DWS WS OBPM DBPM BPM VORP
0 2585 Zaza Pachulia 2003 42.0 2007 1610612737 ATL 62 5 944 ... 0.010 0.181 0.183 0.2 0.9 1.1 -3.9 -1.3 -5.1 -0.7
1 200780 Solomon Jones 2006 33.0 2007 1610612737 ATL 35 0 145 ... 0.026 0.221 0.156 -0.1 0.1 0.0 -6.7 -2.0 -8.8 -0.2
2 2746 Josh Smith 2004 17.0 2007 1610612737 ATL 81 81 2873 ... 0.059 0.155 0.250 1.2 4.6 5.8 0.5 2.5 3.0 3.7
3 201151 Acie Law 2007 11.0 2007 1610612737 ATL 56 6 865 ... 0.000 0.178 0.165 -0.5 0.4 -0.1 -4.2 -1.0 -5.2 -0.7
4 101136 Salim Stoudamire 2005 31.0 2007 1610612737 ATL 35 0 402 ... 0.009 0.094 0.252 0.1 0.1 0.3 -1.0 -2.5 -3.5 -0.1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8487 1630648 Jordan Schakel 2021 NaN 2021 1610612764 WAS 4 0 30 ... 0.000 0.078 0.191 -0.2 0.0 -0.1 -8.6 -4.4 -13.0 -0.1
8488 1630557 Corey Kispert 2021 15.0 2021 1610612764 WAS 77 36 1801 ... 0.010 0.085 0.146 1.6 0.7 2.3 -0.8 -1.5 -2.3 -0.1
8489 1628398 Kyle Kuzma 2017 27.0 2021 1610612764 WAS 66 66 2204 ... 0.022 0.141 0.242 0.0 2.0 2.0 0.2 -0.4 -0.2 1.0
8490 203526 Raul Neto 2013 47.0 2021 1610612764 WAS 70 19 1372 ... 0.002 0.139 0.184 0.7 0.8 1.5 -2.5 -0.5 -3.0 -0.4
8491 1628418 Thomas Bryant 2017 42.0 2021 1610612764 WAS 27 9 439 ... 0.041 0.103 0.187 0.7 0.4 1.1 -0.4 -0.7 -1.0 0.1

8492 rows × 49 columns

In [4]:
team_data
Out[4]:
nbateamid team season games off_rtg def_rtg net_rtg W L
0 1610612737 ATL 2007 82 106.9 108.9 -2.0 37 45
1 1610612751 BKN 2007 82 104.0 109.4 -5.4 34 48
2 1610612738 BOS 2007 82 110.2 98.9 11.3 66 16
3 1610612766 CHA 2007 82 104.6 109.4 -4.8 32 50
4 1610612741 CHI 2007 82 103.9 107.2 -3.3 33 49
... ... ... ... ... ... ... ... ... ...
445 1610612758 SAC 2021 82 109.9 115.3 -5.4 30 52
446 1610612759 SAS 2021 82 112.4 112.3 0.1 34 48
447 1610612761 TOR 2021 82 112.9 110.5 2.4 48 34
448 1610612762 UTA 2021 82 116.7 110.5 6.2 49 33
449 1610612764 WAS 2021 82 111.1 114.5 -3.4 35 47

450 rows × 9 columns

In [5]:
rebounding_data
Out[5]:
team opp_team gamedate game_number offensive_rebounds off_rebound_chances oreb_pct
0 BOS PHI 2022-10-18 1 10 39 0.256410
1 PHI BOS 2022-10-18 1 8 42 0.190476
2 GSW LAL 2022-10-18 1 16 57 0.280702
3 LAL GSW 2022-10-18 1 14 57 0.245614
4 ORL DET 2022-10-19 1 13 47 0.276596
... ... ... ... ... ... ... ...
2455 LAC PHX 2023-04-09 82 18 56 0.321429
2456 MEM OKC 2023-04-09 82 12 55 0.218182
2457 POR GSW 2023-04-09 82 11 61 0.180328
2458 SAC DEN 2023-04-09 82 12 50 0.240000
2459 MIN NOP 2023-04-09 82 11 49 0.224490

2460 rows × 7 columns

What is the average number of points per game for players in the 2007-2021 seasons who won All NBA First, Second, and Third teams (not the All Defensive Teams), as well as for players who were in the All-Star Game (not the rookie all-star game)?

In [6]:
playerstats = awards.merge(player_data, on='nbapersonid', how='inner')
In [7]:
first_team = playerstats[(playerstats['season_x'] >= 2007) & (playerstats['season_x'] <= 2021) & 
                         (playerstats['All NBA First Team'] == 1.0)]

first_team_ppg = first_team['points'] / first_team['games']

print("First Team points per game:",first_team_ppg.mean())



sec_team = playerstats[(playerstats['season_x'] >= 2007) & (playerstats['season_x'] <= 2021) & 
                         (playerstats['All NBA Second Team'] == 1.0)]

sec_team_ppg = sec_team['points'] / sec_team['games']

print("Second Team points per game:",sec_team_ppg.mean())



third_team = playerstats[(playerstats['season_x'] >= 2007) & (playerstats['season_x'] <= 2021) & 
                         (playerstats['All NBA Third Team'] == 1.0)]

third_team_ppg = third_team['points'] / third_team['games']

print("Third Team points per game:",third_team_ppg.mean())



allstar_team = playerstats[(playerstats['season_x'] >= 2007) & (playerstats['season_x'] <= 2021) & 
                         (playerstats['all_star_game'])]

allstar_team_ppg = allstar_team['points'] / allstar_team['games']

print("All-Star Team points per game:",allstar_team_ppg.mean())
First Team points per game: 22.3096871795066
Second Team points per game: 19.602453597125052
Third Team points per game: 17.406737392538716
All-Star Team points per game: 18.73400715244671

What was the average number of years of experience in the league it takes for players to make their first All NBA Selection (1st, 2nd, or 3rd team)? Please limit your sample to players drafted in 2007 or later who did eventually go on to win at least one All NBA selection.

In [8]:
allnba_players = playerstats[(playerstats['draftyear'] >= 2007) &
                                ((playerstats['All NBA First Team'] == 1.0) |
                                 (playerstats['All NBA Second Team'] == 1.0) |
                                 (playerstats['All NBA Third Team'] == 1.0))]

allnba_columns = ['All NBA First Team', 'All NBA Second Team', 'All NBA Third Team']
allnba_players = allnba_players[allnba_players[allnba_columns].sum(axis=1) > 0]


first_allnba_years = allnba_players.groupby('nbapersonid')['season_x'].min()

years_to_first_allnba = first_allnba_years - allnba_players.groupby('nbapersonid')['draftyear'].min() + 1

print("Average years to first All NBA selection:", years_to_first_allnba.mean())
Average years to first All NBA selection: 4.682926829268292

Data Cleaning Interlude¶

You're going to work to create a dataset with a "career outcome" for each player, representing the highest level of success that the player achieved for at least two seasons after his first four seasons in the league. On a single season level, the outcomes are:

  • Elite: A player is "Elite" in a season if he won any All NBA award (1st, 2nd, or 3rd team), MVP, or DPOY in that season.
  • All-Star: A player is "All-Star" in a season if he was selected to be an All-Star that season.
  • Starter: A player is a "Starter" in a season if he started in at least 41 games in the season OR if he played at least 2000 minutes in the season.
  • Rotation: A player is a "Rotation" player in a season if he played at least 1000 minutes in the season.
  • Roster: A player is a "Roster" player in a season if he played at least 1 minute for an NBA team but did not meet any of the above criteria.
  • Out of the League: A player is "Out of the League" if he is not in the NBA in that season.
In [9]:
playerinfo = player_data.merge(awards, on='nbapersonid', how='left')
players_2010_draft = playerinfo[playerinfo['draftyear'] == 2010]
seasongames = team_data.merge(players_2010_draft, on='nbateamid', how='right')

def calculate_career_outcome(player_df):
    elite_count = 0
    all_star_count = 0
    starter_count = 0
    rotation_count = 0
    roster_count = 0
    out_of_league_count = 0
    
    for index, season in player_df.iterrows():
        if index >= 4:

            adjusted_minutes = season['mins']
            adjusted_games_started = season['games']

            if (season['All NBA First Team'] > 0) or (season['All NBA Second Team'] > 0) or (season['All NBA Third Team'] > 0):
                elite_count += 1
            elif season['all_star_game'] == True:
                all_star_count += 1
            elif (adjusted_games_started >= 41) or (adjusted_minutes >= 2000):
                starter_count += 1
            elif adjusted_minutes >= 1000 and adjusted_minutes < 2000:
                rotation_count += 1
            elif adjusted_minutes >= 1:
                roster_count += 1
    
    if elite_count >= 2:
        return "Elite"
    elif all_star_count >= 2:
        return "All-Star"
    elif starter_count >= 2:
        return "Starter"
    elif rotation_count >= 2:
        return "Rotation"
    elif roster_count >= 2:
        return "Roster"
    else:
        return "Out of the League"


results_list = []


for player, data in players_2010_draft.groupby('player'):
    career_outcome = calculate_career_outcome(data)
    results_list.append({'Player': player, 'Career Outcome': career_outcome})

results = pd.DataFrame(results_list)

outcome_counts = results['Career Outcome'].value_counts()

print(outcome_counts)
Starter              31
Roster               20
Out of the League    18
Elite                 3
All-Star              1
Name: Career Outcome, dtype: int64

Open Ended Modeling¶

Making a prediction on which players drafted in 2018 or later, will make the All-Star Game, using the data of players drafted on or before 2015 to train the model

In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

training_data = playerstats[playerstats['draftyear'] <= 2015].copy()
training_data['points_per_game'] = training_data['points'] / training_data['games']

X = training_data[['points', 'ast', 'steals', 'blocks', 'tot_reb', 'PER', 'points_per_game']]
y = training_data['all_star_game'].copy()  # 1 if a player becomes an All-Star, 0 otherwise
y.fillna(False, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

newplayers = playerstats[playerstats['draftyear'] >= 2018].copy()
newplayers['points_per_game'] = newplayers['points'] / newplayers['games']
new_players_data = newplayers[['points', 'ast', 'steals', 'blocks', 'tot_reb', 'PER', 'points_per_game']]
predictions = model.predict(new_players_data)

newplayers['predicted_all_star'] = predictions

all_star_players = newplayers[newplayers['predicted_all_star'] == True]

grouped_players = all_star_players.groupby('player')['predicted_all_star'].max()


print(grouped_players)
Accuracy: 0.8790214477211796
Classification Report:
              precision    recall  f1-score   support

       False       0.89      0.98      0.93      5102
        True       0.70      0.30      0.41       866

    accuracy                           0.88      5968
   macro avg       0.79      0.64      0.67      5968
weighted avg       0.86      0.88      0.86      5968

player
Ja Morant                  True
Luka Doncic                True
Luka Dončić                True
Shai Gilgeous-Alexander    True
Trae Young                 True
Zion Williamson            True
Name: predicted_all_star, dtype: bool

Predicting Team Stats¶

Calculate what OKC's predicted offensive rebound percent is for game 81 in the data. That is, use games 1-80 to predict game 81.

In [11]:
okc_data = rebounding_data[rebounding_data['team'] == 'OKC']

average_offensive_rebound_percent = okc_data.iloc[:80]['offensive_rebounds'].sum() / okc_data.iloc[:80]['off_rebound_chances'].sum()

predicted_offensive_rebound_percent = average_offensive_rebound_percent * 100

print("Predicted Offensive Rebound Percentage for Game 81:", predicted_offensive_rebound_percent)
Predicted Offensive Rebound Percentage for Game 81: 28.8689755388714

Visualizing Data¶

In [12]:
player_stats = pd.read_csv("C:/Users/shari/OneDrive/Desktop/Job Folder/OKC Project/Datasets/2021playerstats.csv", sep=';', encoding='latin1')

player_stats
Out[12]:
Rk Player Pos Age Tm G GS MP FG FGA ... FT% ORB DRB TRB AST STL BLK TOV PF PTS
0 1 Precious Achiuwa C 22 TOR 73 28 23.6 3.6 8.3 ... 0.595 2.0 4.5 6.5 1.1 0.5 0.6 1.2 2.1 9.1
1 2 Steven Adams C 28 MEM 76 75 26.3 2.8 5.1 ... 0.543 4.6 5.4 10.0 3.4 0.9 0.8 1.5 2.0 6.9
2 3 Bam Adebayo C 24 MIA 56 56 32.6 7.3 13.0 ... 0.753 2.4 7.6 10.1 3.4 1.4 0.8 2.6 3.1 19.1
3 4 Santi Aldama PF 21 MEM 32 0 11.3 1.7 4.1 ... 0.625 1.0 1.7 2.7 0.7 0.2 0.3 0.5 1.1 4.1
4 5 LaMarcus Aldridge C 36 BRK 47 12 22.3 5.4 9.7 ... 0.873 1.6 3.9 5.5 0.9 0.3 1.0 0.9 1.7 12.9
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
807 601 Thaddeus Young PF 33 TOR 26 0 18.3 2.6 5.5 ... 0.481 1.5 2.9 4.4 1.7 1.2 0.4 0.8 1.7 6.3
808 602 Trae Young PG 23 ATL 76 76 34.9 9.4 20.3 ... 0.904 0.7 3.1 3.7 9.7 0.9 0.1 4.0 1.7 28.4
809 603 Omer Yurtseven C 23 MIA 56 12 12.6 2.3 4.4 ... 0.623 1.5 3.7 5.3 0.9 0.3 0.4 0.7 1.5 5.3
810 604 Cody Zeller C 29 POR 27 0 13.1 1.9 3.3 ... 0.776 1.9 2.8 4.6 0.8 0.3 0.2 0.7 2.1 5.2
811 605 Ivica Zubac C 24 LAC 76 76 24.4 4.1 6.5 ... 0.727 2.9 5.6 8.5 1.6 0.5 1.0 1.5 2.7 10.3

812 rows × 30 columns

In [13]:
import matplotlib.pyplot as plt

position_counts = player_stats['Pos'].str[:2].value_counts()

plt.figure(figsize=(8, 8))
plt.pie(position_counts, labels=position_counts.index, autopct='%1.1f%%', startangle=140)
plt.title('Player Position Distribution')

plt.axis('equal') 
plt.show()
In [14]:
player_stats_grouped = player_stats.groupby('Player')['PTS'].mean()
player_stats_sorted = player_stats_grouped.sort_values(ascending=False)

top_30_players = player_stats_sorted.head(30)

ppg = top_30_players
players = top_30_players.index

plt.figure(figsize=(10, 6))
plt.bar(players, ppg, color='teal', alpha=0.9) 
plt.title('Top 30 Players by PPG')
plt.xlabel('Player')
plt.ylabel('PPG')

plt.xticks(rotation=90)

plt.tight_layout()
plt.show()
In [15]:
import seaborn as sns

player_stats['PTS'] = player_stats['PTS'].astype(float)  # Convert PPG to float if it's not already
player_stats_sorted = player_stats.sort_values(by='PTS', ascending=False)
top_100_players = player_stats_sorted.head(150)

# Create a pivot table to summarize PPG by position
heatmap_data = top_100_players.pivot_table(values='PTS', index='Pos', aggfunc='mean')

# Create a heatmap using seaborn
plt.figure(figsize=(12, 7))
sns.heatmap(heatmap_data, annot=True, cmap='inferno', fmt='.1f', linewidths=0.2)
plt.title('Heatmap of PPG by Player Position')
plt.xlabel('PPG')
plt.ylabel('Player Position')
plt.show()