import pandas as pd

awards = pd.read_csv(".../awards_data.csv")
player_data = pd.read_csv(".../player_stats.csv")
team_data = pd.read_csv(".../team_stats.csv")
rebounding_data = pd.read_csv("...team_rebounding_data_22.csv")


awards


player_data


team_data


rebounding_data


playerstats = awards.merge(player_data, on='nbapersonid', how='inner')


first_team = playerstats[(playerstats['season_x'] >= 2007) & (playerstats['season_x'] <= 2021) & 
                         (playerstats['All NBA First Team'] == 1.0)]

first_team_ppg = first_team['points'] / first_team['games']

print("First Team points per game:",first_team_ppg.mean())



sec_team = playerstats[(playerstats['season_x'] >= 2007) & (playerstats['season_x'] <= 2021) & 
                         (playerstats['All NBA Second Team'] == 1.0)]

sec_team_ppg = sec_team['points'] / sec_team['games']

print("Second Team points per game:",sec_team_ppg.mean())



third_team = playerstats[(playerstats['season_x'] >= 2007) & (playerstats['season_x'] <= 2021) & 
                         (playerstats['All NBA Third Team'] == 1.0)]

third_team_ppg = third_team['points'] / third_team['games']

print("Third Team points per game:",third_team_ppg.mean())



allstar_team = playerstats[(playerstats['season_x'] >= 2007) & (playerstats['season_x'] <= 2021) & 
                         (playerstats['all_star_game'])]

allstar_team_ppg = allstar_team['points'] / allstar_team['games']

print("All-Star Team points per game:",allstar_team_ppg.mean())

First Team points per game: 22.3096871795066
Second Team points per game: 19.602453597125052
Third Team points per game: 17.406737392538716
All-Star Team points per game: 18.73400715244671


allnba_players = playerstats[(playerstats['draftyear'] >= 2007) &
                                ((playerstats['All NBA First Team'] == 1.0) |
                                 (playerstats['All NBA Second Team'] == 1.0) |
                                 (playerstats['All NBA Third Team'] == 1.0))]

allnba_columns = ['All NBA First Team', 'All NBA Second Team', 'All NBA Third Team']
allnba_players = allnba_players[allnba_players[allnba_columns].sum(axis=1) > 0]


first_allnba_years = allnba_players.groupby('nbapersonid')['season_x'].min()

years_to_first_allnba = first_allnba_years - allnba_players.groupby('nbapersonid')['draftyear'].min() + 1

print("Average years to first All NBA selection:", years_to_first_allnba.mean())

Average years to first All NBA selection: 4.682926829268292


playerinfo = player_data.merge(awards, on='nbapersonid', how='left')
players_2010_draft = playerinfo[playerinfo['draftyear'] == 2010]
seasongames = team_data.merge(players_2010_draft, on='nbateamid', how='right')

def calculate_career_outcome(player_df):
    elite_count = 0
    all_star_count = 0
    starter_count = 0
    rotation_count = 0
    roster_count = 0
    out_of_league_count = 0
    
    for index, season in player_df.iterrows():
        if index >= 4:

            adjusted_minutes = season['mins']
            adjusted_games_started = season['games']

            if (season['All NBA First Team'] > 0) or (season['All NBA Second Team'] > 0) or (season['All NBA Third Team'] > 0):
                elite_count += 1
            elif season['all_star_game'] == True:
                all_star_count += 1
            elif (adjusted_games_started >= 41) or (adjusted_minutes >= 2000):
                starter_count += 1
            elif adjusted_minutes >= 1000 and adjusted_minutes < 2000:
                rotation_count += 1
            elif adjusted_minutes >= 1:
                roster_count += 1
    
    if elite_count >= 2:
        return "Elite"
    elif all_star_count >= 2:
        return "All-Star"
    elif starter_count >= 2:
        return "Starter"
    elif rotation_count >= 2:
        return "Rotation"
    elif roster_count >= 2:
        return "Roster"
    else:
        return "Out of the League"


results_list = []


for player, data in players_2010_draft.groupby('player'):
    career_outcome = calculate_career_outcome(data)
    results_list.append({'Player': player, 'Career Outcome': career_outcome})

results = pd.DataFrame(results_list)

outcome_counts = results['Career Outcome'].value_counts()

print(outcome_counts)

Starter              31
Roster               20
Out of the League    18
Elite                 3
All-Star              1
Name: Career Outcome, dtype: int64


import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

training_data = playerstats[playerstats['draftyear'] <= 2015].copy()
training_data['points_per_game'] = training_data['points'] / training_data['games']

X = training_data[['points', 'ast', 'steals', 'blocks', 'tot_reb', 'PER', 'points_per_game']]
y = training_data['all_star_game'].copy()  # 1 if a player becomes an All-Star, 0 otherwise
y.fillna(False, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

newplayers = playerstats[playerstats['draftyear'] >= 2018].copy()
newplayers['points_per_game'] = newplayers['points'] / newplayers['games']
new_players_data = newplayers[['points', 'ast', 'steals', 'blocks', 'tot_reb', 'PER', 'points_per_game']]
predictions = model.predict(new_players_data)

newplayers['predicted_all_star'] = predictions

all_star_players = newplayers[newplayers['predicted_all_star'] == True]

grouped_players = all_star_players.groupby('player')['predicted_all_star'].max()


print(grouped_players)

Accuracy: 0.8790214477211796
Classification Report:
              precision    recall  f1-score   support

       False       0.89      0.98      0.93      5102
        True       0.70      0.30      0.41       866

    accuracy                           0.88      5968
   macro avg       0.79      0.64      0.67      5968
weighted avg       0.86      0.88      0.86      5968

player
Ja Morant                  True
Luka Doncic                True
Luka Dončić                True
Shai Gilgeous-Alexander    True
Trae Young                 True
Zion Williamson            True
Name: predicted_all_star, dtype: bool


okc_data = rebounding_data[rebounding_data['team'] == 'OKC']

average_offensive_rebound_percent = okc_data.iloc[:80]['offensive_rebounds'].sum() / okc_data.iloc[:80]['off_rebound_chances'].sum()

predicted_offensive_rebound_percent = average_offensive_rebound_percent * 100

print("Predicted Offensive Rebound Percentage for Game 81:", predicted_offensive_rebound_percent)

Predicted Offensive Rebound Percentage for Game 81: 28.8689755388714


player_stats = pd.read_csv("C:/Users/shari/OneDrive/Desktop/Job Folder/OKC Project/Datasets/2021playerstats.csv", sep=';', encoding='latin1')

player_stats


import matplotlib.pyplot as plt

position_counts = player_stats['Pos'].str[:2].value_counts()

plt.figure(figsize=(8, 8))
plt.pie(position_counts, labels=position_counts.index, autopct='%1.1f%%', startangle=140)
plt.title('Player Position Distribution')

plt.axis('equal') 
plt.show()


player_stats_grouped = player_stats.groupby('Player')['PTS'].mean()
player_stats_sorted = player_stats_grouped.sort_values(ascending=False)

top_30_players = player_stats_sorted.head(30)

ppg = top_30_players
players = top_30_players.index

plt.figure(figsize=(10, 6))
plt.bar(players, ppg, color='teal', alpha=0.9) 
plt.title('Top 30 Players by PPG')
plt.xlabel('Player')
plt.ylabel('PPG')

plt.xticks(rotation=90)

plt.tight_layout()
plt.show()


import seaborn as sns

player_stats['PTS'] = player_stats['PTS'].astype(float)  # Convert PPG to float if it's not already
player_stats_sorted = player_stats.sort_values(by='PTS', ascending=False)
top_100_players = player_stats_sorted.head(150)

# Create a pivot table to summarize PPG by position
heatmap_data = top_100_players.pivot_table(values='PTS', index='Pos', aggfunc='mean')

# Create a heatmap using seaborn
plt.figure(figsize=(12, 7))
sns.heatmap(heatmap_data, annot=True, cmap='inferno', fmt='.1f', linewidths=0.2)
plt.title('Heatmap of PPG by Player Position')
plt.xlabel('PPG')
plt.ylabel('Player Position')
plt.show()

	season	nbapersonid	All NBA Defensive First Team	All NBA Defensive Second Team	All NBA First Team	All NBA Second Team	All NBA Third Team	All Rookie First Team	All Rookie Second Team	Bill Russell NBA Finals MVP	...	all_star_game	rookie_all_star_game	allstar_rk	Defensive Player Of The Year_rk	Most Improved Player_rk	Most Valuable Player_rk	Rookie Of The Year_rk	Sixth Man Of The Year_rk	all_nba_points_rk	all_rookie_points_rk
0	2007	708.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	...	True	False	1.0	1.0	NaN	3.0	NaN	NaN	NaN	NaN
1	2007	947.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	True	False	2.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	2007	948.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	NaN	NaN	3.0	2.0	NaN	NaN	NaN	NaN	NaN	NaN
3	2007	959.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	...	True	False	4.0	NaN	NaN	9.0	NaN	NaN	NaN	NaN
4	2007	977.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	...	True	False	1.0	5.0	NaN	1.0	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
4324	2015	1626170.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	24.0
4325	2015	1626202.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	24.0
4326	2015	1626273.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	24.0
4327	2018	1628971.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	18.0
4328	2020	1630214.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	18.0

	nbapersonid	player	draftyear	draftpick	season	nbateamid	team	games	games_start	mins	...	blk_pct	tov_pct	usg	OWS	DWS	WS	OBPM	DBPM	BPM	VORP
0	2585	Zaza Pachulia	2003	42.0	2007	1610612737	ATL	62	5	944	...	0.010	0.181	0.183	0.2	0.9	1.1	-3.9	-1.3	-5.1	-0.7
1	200780	Solomon Jones	2006	33.0	2007	1610612737	ATL	35	0	145	...	0.026	0.221	0.156	-0.1	0.1	0.0	-6.7	-2.0	-8.8	-0.2
2	2746	Josh Smith	2004	17.0	2007	1610612737	ATL	81	81	2873	...	0.059	0.155	0.250	1.2	4.6	5.8	0.5	2.5	3.0	3.7
3	201151	Acie Law	2007	11.0	2007	1610612737	ATL	56	6	865	...	0.000	0.178	0.165	-0.5	0.4	-0.1	-4.2	-1.0	-5.2	-0.7
4	101136	Salim Stoudamire	2005	31.0	2007	1610612737	ATL	35	0	402	...	0.009	0.094	0.252	0.1	0.1	0.3	-1.0	-2.5	-3.5	-0.1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
8487	1630648	Jordan Schakel	2021	NaN	2021	1610612764	WAS	4	0	30	...	0.000	0.078	0.191	-0.2	0.0	-0.1	-8.6	-4.4	-13.0	-0.1
8488	1630557	Corey Kispert	2021	15.0	2021	1610612764	WAS	77	36	1801	...	0.010	0.085	0.146	1.6	0.7	2.3	-0.8	-1.5	-2.3	-0.1
8489	1628398	Kyle Kuzma	2017	27.0	2021	1610612764	WAS	66	66	2204	...	0.022	0.141	0.242	0.0	2.0	2.0	0.2	-0.4	-0.2	1.0
8490	203526	Raul Neto	2013	47.0	2021	1610612764	WAS	70	19	1372	...	0.002	0.139	0.184	0.7	0.8	1.5	-2.5	-0.5	-3.0	-0.4
8491	1628418	Thomas Bryant	2017	42.0	2021	1610612764	WAS	27	9	439	...	0.041	0.103	0.187	0.7	0.4	1.1	-0.4	-0.7	-1.0	0.1

	nbateamid	team	season	games	off_rtg	def_rtg	net_rtg	W	L
0	1610612737	ATL	2007	82	106.9	108.9	-2.0	37	45
1	1610612751	BKN	2007	82	104.0	109.4	-5.4	34	48
2	1610612738	BOS	2007	82	110.2	98.9	11.3	66	16
3	1610612766	CHA	2007	82	104.6	109.4	-4.8	32	50
4	1610612741	CHI	2007	82	103.9	107.2	-3.3	33	49
...	...	...	...	...	...	...	...	...	...
445	1610612758	SAC	2021	82	109.9	115.3	-5.4	30	52
446	1610612759	SAS	2021	82	112.4	112.3	0.1	34	48
447	1610612761	TOR	2021	82	112.9	110.5	2.4	48	34
448	1610612762	UTA	2021	82	116.7	110.5	6.2	49	33
449	1610612764	WAS	2021	82	111.1	114.5	-3.4	35	47

	team	opp_team	gamedate	game_number	offensive_rebounds	off_rebound_chances	oreb_pct
0	BOS	PHI	2022-10-18	1	10	39	0.256410
1	PHI	BOS	2022-10-18	1	8	42	0.190476
2	GSW	LAL	2022-10-18	1	16	57	0.280702
3	LAL	GSW	2022-10-18	1	14	57	0.245614
4	ORL	DET	2022-10-19	1	13	47	0.276596
...	...	...	...	...	...	...	...
2455	LAC	PHX	2023-04-09	82	18	56	0.321429
2456	MEM	OKC	2023-04-09	82	12	55	0.218182
2457	POR	GSW	2023-04-09	82	11	61	0.180328
2458	SAC	DEN	2023-04-09	82	12	50	0.240000
2459	MIN	NOP	2023-04-09	82	11	49	0.224490

	Rk	Player	Pos	Age	Tm	G	GS	MP	FG	FGA	...	FT%	ORB	DRB	TRB	AST	STL	BLK	TOV	PF	PTS
0	1	Precious Achiuwa	C	22	TOR	73	28	23.6	3.6	8.3	...	0.595	2.0	4.5	6.5	1.1	0.5	0.6	1.2	2.1	9.1
1	2	Steven Adams	C	28	MEM	76	75	26.3	2.8	5.1	...	0.543	4.6	5.4	10.0	3.4	0.9	0.8	1.5	2.0	6.9
2	3	Bam Adebayo	C	24	MIA	56	56	32.6	7.3	13.0	...	0.753	2.4	7.6	10.1	3.4	1.4	0.8	2.6	3.1	19.1
3	4	Santi Aldama	PF	21	MEM	32	0	11.3	1.7	4.1	...	0.625	1.0	1.7	2.7	0.7	0.2	0.3	0.5	1.1	4.1
4	5	LaMarcus Aldridge	C	36	BRK	47	12	22.3	5.4	9.7	...	0.873	1.6	3.9	5.5	0.9	0.3	1.0	0.9	1.7	12.9
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
807	601	Thaddeus Young	PF	33	TOR	26	0	18.3	2.6	5.5	...	0.481	1.5	2.9	4.4	1.7	1.2	0.4	0.8	1.7	6.3
808	602	Trae Young	PG	23	ATL	76	76	34.9	9.4	20.3	...	0.904	0.7	3.1	3.7	9.7	0.9	0.1	4.0	1.7	28.4
809	603	Omer Yurtseven	C	23	MIA	56	12	12.6	2.3	4.4	...	0.623	1.5	3.7	5.3	0.9	0.3	0.4	0.7	1.5	5.3
810	604	Cody Zeller	C	29	POR	27	0	13.1	1.9	3.3	...	0.776	1.9	2.8	4.6	0.8	0.3	0.2	0.7	2.1	5.2
811	605	Ivica Zubac	C	24	LAC	76	76	24.4	4.1	6.5	...	0.727	2.9	5.6	8.5	1.6	0.5	1.0	1.5	2.7	10.3

Setup and Data¶

Data Cleaning Interlude¶

Open Ended Modeling¶

Predicting Team Stats¶

Visualizing Data¶