Oscar Gold, or Literal Gold? Walking the Tightrope Between Art and Commerce

Film has always walked a fine line between artistic integrity and commercial success. This project explores that tension using a very large dataset of films (every film currently in The Movie Database, TMDB) that includes financial information (budget, revenue) as well as audience reception (IMDb ratings, number of votes, etc.). My goal is to understand where these two metrics coincide, where they diverge, and what trends emerge over time.

Earlier this semester, using R, I found that

  • Film revenue has increased significantly over time, especially since the 1980s. The film industry has become much, much more of an industry.
  • High budgets strongly correlate with high revenue, suggesting that financial investment begets financial returns. You have to spend money to make money, it seems.
  • A histogram of IMDb ratings showed that most films cluster around the 6/10 to 7.5/10 range, meaning that critical success is much harder to come by consistently than financial success.
  • Running statistical tests, I found that films with an above-average IMDb rating earn significantly more revenue on average, but under a Wilcoxon rank-sum test (a rank-based test that is less sensitive to outliers), the difference is no longer statistically significant. This implies that blockbuster hits skew perceptions of success.

The hypothesis I want to test is that big-budget, higher-earning films will have more IMDb votes and higher IMDb ratings than lower-earning films.

This notebook builds on those findings and moves the analysis to Python for deeper exploration, covering:

  • Loading the dataset using Pandas.
  • Cleaning and preprocessing the data, ensuring everything is in an appropriate format.
  • Recreating/Updating visualizations from my preliminary R analysis, including
    • Rating distributions
    • Revenue vs. Budget
  • Performing additional statistical tests in Python, such as
    • Correlation metrics
    • Normality tests or regression models
  • Applying K-means clustering to numerical variables like revenue, budget, and IMDb rating to investigate whether films naturally group into "profit only," "acclaim only," or "profit and acclaim" categories.
  • Visualizing the clustering results and determining whether they support or contradict the hypothesized link between financial and critical success.
  • Reflecting on this clustering method, asking whether K-means is appropriate for film data and how it could be improved.
In [1]:
import pandas as pd
df = pd.read_csv("TMDB_all_movies.csv")

df.head()
Out[1]:
id title vote_average vote_count status release_date revenue runtime budget imdb_id ... spoken_languages cast director director_of_photography writers producers music_composer imdb_rating imdb_votes poster_path
0 2 Ariel 7.100 353.0 Released 1988-10-21 0.0 73.0 0.0 tt0094675 ... suomi Marja Packalén, Olli Varja, Matti Pellonpää, J... Aki Kaurismäki Timo Salminen Aki Kaurismäki Aki Kaurismäki NaN 7.4 9329.0 /ojDg0PGvs6R9xYFodRct2kdI6wC.jpg
1 3 Shadows in Paradise 7.291 413.0 Released 1986-10-17 0.0 74.0 0.0 tt0092149 ... suomi, English, svenska Riikka Kuosmanen, Bertta Pellonpää, Aki Kauris... Aki Kaurismäki Timo Salminen Aki Kaurismäki Mika Kaurismäki NaN 7.4 8166.0 /nj01hspawPof0mJmlgfjuLyJuRN.jpg
2 5 Four Rooms 5.869 2709.0 Released 1995-12-09 4257354.0 98.0 4000000.0 tt0113101 ... English Paul Skemp, Sammi Davis, Quinn Hellerman, Davi... Robert Rodriguez, Allison Anders, Quentin Tara... Andrzej Sekula, Rodrigo García, Guillermo Nava... Robert Rodriguez, Allison Anders, Quentin Tara... Lawrence Bender, Quentin Tarantino, Alexandre ... Combustible Edison 6.7 114887.0 /75aHn1NOYXh4M7L5shoeQ6NGykP.jpg
3 6 Judgment Night 6.500 354.0 Released 1993-10-15 12136938.0 109.0 21000000.0 tt0107286 ... English Michael Wiseman, Michael DeLorenzo, Everlast, ... Stephen Hopkins Peter Levy Jere Cunningham, Lewis Colick Gene Levy, Marilyn Vance, Lloyd Segan Alan Silvestri 6.6 20268.0 /3rvvpS9YPM5HB2f4HYiNiJVtdam.jpg
4 8 Life in Loops (A Megacities RMX) 7.500 27.0 Released 2006-01-01 0.0 80.0 42000.0 tt0825671 ... English, हिन्दी, 日本語, Pусский, Español NaN Timo Novotny Wolfgang Thaler Michael Glawogger, Timo Novotny Ulrich Gehmacher, Timo Novotny NaN 8.1 285.0 /7ln81BRnPR2wqxuITZxEciCe1lc.jpg

5 rows × 28 columns

In [2]:
# Dropping columns that will not be used in analysis to declutter the dataset
df.drop(
    columns = ['imdb_id', 'original_language', 'original_title', 'overview',
               'tagline', 'spoken_languages', 'director_of_photography', 'music_composer',
               'poster_path'], inplace = True
)

df.head()
Out[2]:
id title vote_average vote_count status release_date revenue runtime budget popularity genres production_companies production_countries cast director writers producers imdb_rating imdb_votes
0 2 Ariel 7.100 353.0 Released 1988-10-21 0.0 73.0 0.0 1.0117 Comedy, Drama, Romance, Crime Villealfa Filmproductions Finland Marja Packalén, Olli Varja, Matti Pellonpää, J... Aki Kaurismäki Aki Kaurismäki Aki Kaurismäki 7.4 9329.0
1 3 Shadows in Paradise 7.291 413.0 Released 1986-10-17 0.0 74.0 0.0 0.6984 Comedy, Drama, Romance Villealfa Filmproductions Finland Riikka Kuosmanen, Bertta Pellonpää, Aki Kauris... Aki Kaurismäki Aki Kaurismäki Mika Kaurismäki 7.4 8166.0
2 5 Four Rooms 5.869 2709.0 Released 1995-12-09 4257354.0 98.0 4000000.0 2.6362 Comedy Miramax, A Band Apart United States of America Paul Skemp, Sammi Davis, Quinn Hellerman, Davi... Robert Rodriguez, Allison Anders, Quentin Tara... Robert Rodriguez, Allison Anders, Quentin Tara... Lawrence Bender, Quentin Tarantino, Alexandre ... 6.7 114887.0
3 6 Judgment Night 6.500 354.0 Released 1993-10-15 12136938.0 109.0 21000000.0 1.2895 Action, Crime, Thriller Largo Entertainment, JVC, Universal Pictures United States of America Michael Wiseman, Michael DeLorenzo, Everlast, ... Stephen Hopkins Jere Cunningham, Lewis Colick Gene Levy, Marilyn Vance, Lloyd Segan 6.6 20268.0
4 8 Life in Loops (A Megacities RMX) 7.500 27.0 Released 2006-01-01 0.0 80.0 42000.0 3.2030 Documentary inLoops Austria NaN Timo Novotny Michael Glawogger, Timo Novotny Ulrich Gehmacher, Timo Novotny 8.1 285.0
In [3]:
# Converting columns to appropriate data types
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['budget'] = pd.to_numeric(df['budget'], errors='coerce')
df['revenue'] = pd.to_numeric(df['revenue'], errors='coerce')
df['imdb_rating'] = pd.to_numeric(df['imdb_rating'], errors='coerce')
df['imdb_votes'] = pd.to_numeric(df['imdb_votes'], errors='coerce')

# Removing films with missing essential values
df = df.dropna(subset=['release_date', 'imdb_rating', 'imdb_votes', 'budget', 'revenue'])

# Filtering films to match the R analysis (post-1920 & > 5000 IMDb votes)
df_filtered = df[(df['release_date'] >= "1920-01-01") & (df['imdb_votes'] > 5000)]

df_filtered.info()
<class 'pandas.core.frame.DataFrame'>
Index: 19219 entries, 0 to 1083723
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   id                    19219 non-null  int64         
 1   title                 19219 non-null  object        
 2   vote_average          19219 non-null  float64       
 3   vote_count            19219 non-null  float64       
 4   status                19219 non-null  object        
 5   release_date          19219 non-null  datetime64[ns]
 6   revenue               19219 non-null  float64       
 7   runtime               19219 non-null  float64       
 8   budget                19219 non-null  float64       
 9   popularity            19219 non-null  float64       
 10  genres                19201 non-null  object        
 11  production_companies  18896 non-null  object        
 12  production_countries  19086 non-null  object        
 13  cast                  19166 non-null  object        
 14  director              19201 non-null  object        
 15  writers               18891 non-null  object        
 16  producers             18376 non-null  object        
 17  imdb_rating           19219 non-null  float64       
 18  imdb_votes            19219 non-null  float64       
dtypes: datetime64[ns](1), float64(8), int64(1), object(9)
memory usage: 2.9+ MB

The blocks above load the film dataset into a DataFrame and convert numeric and date fields into usable formats. I also filtered the data to include only films released after 1920 with more than 5,000 IMDb votes, the same criteria used in my preliminary analysis, to remove obscure or very early non-commercial films. This cleaned dataset is used for the visualizations, statistical tests, and clustering below.
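
One caveat worth noting: dropna() only removes true missing values, and TMDB frequently records an unknown budget or revenue as 0 (Ariel above shows 0.0 for both). A quick check like the sketch below, not part of the original R criteria and not run as a numbered cell here, would show how many placeholder zeros survive the filter; the visualization step later on excludes them explicitly.

# Counting placeholder zeros that dropna() does not catch
zero_budget = (df_filtered['budget'] == 0).sum()
zero_revenue = (df_filtered['revenue'] == 0).sum()
print(f"Films with budget recorded as 0:  {zero_budget}")
print(f"Films with revenue recorded as 0: {zero_revenue}")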

Visualizations

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10,5))
sns.lineplot(
    x=df_filtered['release_date'].dt.year,
    y=df_filtered['revenue'],
    estimator='sum',
    errorbar=None
)
plt.title('Total Revenue Over Time')
plt.xlabel('Year')
plt.ylabel('Total Revenue (USD)')
plt.grid(True)
plt.show()
[Figure: Total Revenue Over Time (line plot of summed revenue by release year)]
In [5]:
plt.figure(figsize=(8,5))
sns.histplot(
    df_filtered['imdb_rating'],
    bins=20,
    kde=False,
    color='skyblue',
    edgecolor='black'
)
plt.title('Distribution of IMDb Ratings')
plt.xlabel('IMDb Rating')
plt.ylabel('Number of Films')
plt.show()
[Figure: Distribution of IMDb Ratings (histogram)]
In [6]:
plt.figure(figsize=(8,5))

# Scatter + trendline
sns.regplot(
    x='budget',
    y='revenue',
    data=df_filtered,
    scatter_kws={'alpha':0.5},
    line_kws={'color':'orange'}
)

plt.xscale('log')
plt.yscale('log')

plt.title('Budget vs Revenue')
plt.xlabel('Budget (USD, log scale)')
plt.ylabel('Revenue (USD, log scale)')
plt.grid(True, which='both', ls='--', lw=0.5)
plt.show()
[Figure: Budget vs Revenue, log-log scatterplot with trendline]

Correlation and Statistical Measures

In [7]:
# Selecting relevant numerical columns
corr_cols = ['budget', 'revenue', 'imdb_rating', 'imdb_votes']

# Computing correlation matrices
pearson_corr = df_filtered[corr_cols].corr(method='pearson')
spearman_corr = df_filtered[corr_cols].corr(method='spearman')
kendall_corr = df_filtered[corr_cols].corr(method='kendall')

# Displaying results
print("Pearson Correlation:\n", pearson_corr, "\n")
print("Spearman Correlation:\n", spearman_corr, "\n")
print("Kendall Tau Correlation:\n", kendall_corr)
Pearson Correlation:
                budget   revenue  imdb_rating  imdb_votes
budget       1.000000  0.740239    -0.022827    0.497069
revenue      0.740239  1.000000     0.085193    0.628362
imdb_rating -0.022827  0.085193     1.000000    0.223450
imdb_votes   0.497069  0.628362     0.223450    1.000000 

Spearman Correlation:
                budget   revenue  imdb_rating  imdb_votes
budget       1.000000  0.741602    -0.103092    0.583164
revenue      0.741602  1.000000    -0.000279    0.634760
imdb_rating -0.103092 -0.000279     1.000000    0.164288
imdb_votes   0.583164  0.634760     0.164288    1.000000 

Kendall Tau Correlation:
                budget   revenue  imdb_rating  imdb_votes
budget       1.000000  0.614371    -0.075183    0.433624
revenue      0.614371  1.000000     0.000258    0.481712
imdb_rating -0.075183  0.000258     1.000000    0.112860
imdb_votes   0.433624  0.481712     0.112860    1.000000

These results show that budget and revenue have a strong positive correlation (Pearson r ≈ 0.74), consistent with the scatterplot above: higher-budget movies tend to earn higher revenue.

Budget and revenue, however, show essentially no correlation with IMDb rating, so spending more on a film, or earning more from it, does not necessarily mean critical success will follow. The two measures of success appear largely independent of each other.
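
As a robustness check (a sketch on the same data, not run as a numbered cell here), the budget-revenue relationship can be re-measured on a log scale, restricted to films with strictly positive values so the logarithm is defined. Spearman and Kendall are rank-based and unchanged by a monotonic transform, so this mainly tests whether the Pearson value is driven by a few huge blockbusters.

import numpy as np

# Pearson correlation of log-transformed budget and revenue (positive values only)
pos = df_filtered[(df_filtered['budget'] > 0) & (df_filtered['revenue'] > 0)]
log_pearson = np.log10(pos[['budget', 'revenue']]).corr(method='pearson')
print("Pearson correlation on log10 scale:\n", log_pearson)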

In [8]:
from scipy import stats
import numpy as np

# Splitting films into above vs below mean IMDb rating
mean_rating = df_filtered['imdb_rating'].mean()

above_avg = df_filtered[df_filtered['imdb_rating'] > mean_rating]['revenue']
below_avg = df_filtered[df_filtered['imdb_rating'] <= mean_rating]['revenue']

# T-test (Welch's t-test, does not assume equal variances)
t_stat, t_pval = stats.ttest_ind(above_avg, below_avg, equal_var=False)
print(f"T-test:\n  t-statistic = {t_stat:.3f}, p-value = {t_pval:.3e}")

# Wilcoxon rank-sum test (non-parametric)
wilcox_stat, wilcox_pval = stats.ranksums(above_avg, below_avg)
print(f"Wilcoxon rank-sum test:\n  statistic = {wilcox_stat:.3f}, p-value = {wilcox_pval:.3e}")
T-test:
  t-statistic = 7.775, p-value = 7.951e-15
Wilcoxon rank-sum test:
  statistic = -1.057, p-value = 2.906e-01

Welch's t-test, which compares group means, suggests that highly rated films make more money. However, the Wilcoxon rank-sum test, which compares the full rank distributions and is therefore robust to outliers, finds no significant difference. A few blockbuster outliers appear to drive the difference in means; the typical film's revenue is largely independent of its IMDb rating.
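
To make the outlier effect concrete, the group means and medians can be compared directly; a minimal sketch using the above_avg and below_avg splits defined above (a large mean-to-median gap indicates heavy right skew from a few blockbusters):

# Mean vs. median revenue for each rating group
revenue_summary = pd.DataFrame({
    'mean_revenue':   [above_avg.mean(), below_avg.mean()],
    'median_revenue': [above_avg.median(), below_avg.median()]
}, index=['above-average rating', 'below-average rating'])
print(revenue_summary)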

Clustering

To reiterate, the hypothesis being tested is that big-budget, higher-earning films (cluster 1) will have more IMDb votes and higher IMDb ratings than lower-earning films (clusters 0 and 2).

In [9]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Selecting features and dropping missing values
features = ['budget', 'revenue', 'imdb_rating', 'imdb_votes']
df_cluster = df_filtered[features].dropna().copy()  # Make a safe copy

# Standardizing numeric values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_cluster)
In [10]:
# Run K-Means with 3 clusters
kmeans_3 = KMeans(n_clusters=3, random_state=42, n_init=10)
df_cluster['cluster_3'] = kmeans_3.fit_predict(X_scaled)

# Adding cluster labels back to original filtered DataFrame
df_filtered = df_filtered.copy()
df_filtered.loc[df_cluster.index, 'cluster_3'] = df_cluster['cluster_3']
df_filtered['cluster_3'] = df_filtered['cluster_3'].astype('category')
In [11]:
# Silhouette score and cluster info
sil_score_3 = silhouette_score(X_scaled, df_cluster['cluster_3'])
print(f"Silhouette Score for 3 clusters: {sil_score_3:.4f}")

print("\nCluster Sizes:")
print(df_cluster['cluster_3'].value_counts())

print("\nCluster Centers (original scale):")
centers_3 = scaler.inverse_transform(kmeans_3.cluster_centers_)
centers_3_df = pd.DataFrame(centers_3, columns=features)
print(centers_3_df)
Silhouette Score for 3 clusters: 0.3921

Cluster Sizes:
cluster_3
2    11371
0     6969
1      879
Name: count, dtype: int64

Cluster Centers (original scale):
         budget       revenue  imdb_rating     imdb_votes
0  1.306817e+07  2.088164e+07     5.407644   27031.331278
1  1.144209e+08  4.406570e+08     7.037543  500213.948805
2  6.583830e+06  1.746079e+07     7.139623   44286.039676
In [12]:
# Visualizing clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df_filtered,
    x='budget',
    y='revenue',
    hue='cluster_3',
    palette='tab10',
    size='imdb_votes',
    sizes=(20, 200),
    alpha=0.6
)

plt.xscale('log')
plt.yscale('log')
plt.title("K-Means Clustering of Movies (3 Clusters)\nBudget vs Revenue (Log Scale)")
plt.xlabel("Budget (log scale)")
plt.ylabel("Revenue (log scale)")
plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
[Figure: K-Means Clustering of Movies (3 Clusters), Budget vs Revenue on log scales]

Most films sit in roughly the 10^5 to 10^7 range for both budget and revenue, but a handful of low outliers (budgets or revenues near the bottom of the scale) stretch the axes and squeeze the main cloud of points into the upper-right corner of the plot. I filter the data slightly further to account for this.

In [13]:
# Keeping only movies within a reasonable budget and revenue range
budget_min, budget_max = 1e3, 1e9
revenue_min, revenue_max = 1e3, 1e9

df_viz = df_filtered[
    (df_filtered['budget'] >= budget_min) & (df_filtered['budget'] <= budget_max) &
    (df_filtered['revenue'] >= revenue_min) & (df_filtered['revenue'] <= revenue_max)
].copy()
In [14]:
plt.figure(figsize=(10, 6))

sns.scatterplot(
    data=df_viz,
    x='budget',
    y='revenue',
    hue='cluster_3',
    palette='tab10',
    size='imdb_votes',
    sizes=(20, 200),
    alpha=0.6
)

for i, row in centers_3_df.iterrows():
    plt.scatter(
        row['budget'], 
        row['revenue'], 
        marker='X',           # X marker for centroid
        s=250,                # marker size
        c='black',            # color
        edgecolor='white',    # optional: border for visibility
        label=f'Center {i}'
    )

plt.xscale('log')
plt.yscale('log')
plt.title("K-Means Clustering of Movies with Cluster Centers\nBudget vs Revenue (Log Scale)")
plt.xlabel("Budget (log)")
plt.ylabel("Revenue (log)")
plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
[Figure: K-Means Clustering of Movies with Cluster Centers, Budget vs Revenue on log scales]

This clustering reveals three groups:

  • Cluster 0: Low-to-mid budget, lower-rated films
  • Cluster 1: High-budget blockbusters
  • Cluster 2: Low-budget, critically acclaimed films

Most movies are modest in both budget and performance, while blockbusters are rare but dominate financially. That said, smaller films can still achieve critical success. The silhouette score of ~0.39 indicates only moderate separation: the clusters overlap, but they still reflect recognizable industry patterns.
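
Because budgets, revenues, and vote counts are heavily right-skewed, per-cluster medians are a useful complement to the centroid means printed above: they describe the typical film in each cluster rather than one dragged upward by outliers. A minimal sketch (not run as a numbered cell) using the df_cluster frame built earlier:

# Median of each feature within each cluster (less outlier-sensitive than the centroids)
cluster_medians = df_cluster.groupby('cluster_3')[features].median()
print(cluster_medians)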

In [15]:
cluster_summary = df_viz.groupby('cluster_3', observed=False)[['imdb_votes']].mean()
cluster_summary
Out[15]:
imdb_votes
cluster_3
0.0 43018.768414
1.0 486196.318293
2.0 83840.986545
In [16]:
cluster_summary.plot(kind='bar', figsize=(10,6))
plt.title('Cluster Comparison: Average IMDb Votes')
plt.xlabel('Cluster')
plt.ylabel('Average IMDb Votes')
plt.tight_layout()
plt.show()
[Figure: bar chart of average IMDb votes per cluster]

This graph confirms part of the alternative hypothesis: films in the high-revenue blockbuster cluster have, on average, a far higher number of IMDb votes than films in the lower-revenue clusters.
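
To go beyond reading the bar chart, the vote-count differences between clusters could also be tested formally. Below is an illustrative sketch (not a cell that was run) using SciPy's Kruskal-Wallis test, a rank-based analogue of ANOVA that fits the non-parametric approach used earlier.

from scipy.stats import kruskal

# Comparing imdb_votes distributions across the three clusters
votes_by_cluster = [
    grp['imdb_votes'].values
    for _, grp in df_viz.groupby('cluster_3', observed=True)
]
h_stat, h_pval = kruskal(*votes_by_cluster)
print(f"Kruskal-Wallis: H = {h_stat:.3f}, p = {h_pval:.3e}")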

In [17]:
from scipy.stats import pearsonr

for cluster in df_filtered['cluster_3'].unique():
    subset = df_filtered[df_filtered['cluster_3'] == cluster]
    r, p = pearsonr(subset['imdb_rating'], subset['revenue'])
    print(f"Cluster {cluster}: Rating vs Revenue correlation = {r:.3f}, p = {p:.4f}")
Cluster 2.0: Rating vs Revenue correlation = -0.048, p = 0.0000
Cluster 1.0: Rating vs Revenue correlation = 0.077, p = 0.0232
Cluster 0.0: Rating vs Revenue correlation = 0.133, p = 0.0000
  • The clustering results partially align with the alternative hypothesis, but overall the null hypothesis cannot be rejected: while the correlation between financial success and visibility/popularity is strong, the correlation between financial success and critical success is weak. This shows that strong critical acclaim can occur independently of revenue.
  • K-Means is suitable for detecting broad groupings based on numeric features, especially since these variables are continuous and were standardized before clustering. However, K-Means assumes roughly spherical clusters of similar variance, which may not match the skewed distribution of movie budgets and revenues. The moderate silhouette score, ~0.39, suggests overlap between clusters and indicates some limitations in separation.
  • Some potential improvements are:
    • Log-transform skewed features, like budget and revenue, to reduce the influence of extreme outliers and improve cluster shape (a rough sketch follows just after this list).
    • Try alternative clustering methods, such as DBSCAN or Gaussian mixture models, which can capture non-spherical clusters and better handle outliers.
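
As a rough illustration of the first improvement (a sketch only, reusing the features list, df_cluster frame, and scikit-learn imports from the clustering cells above), the monetary features can be log-transformed before standardizing and re-clustering; the resulting silhouette score gives one point of comparison against the ~0.39 obtained earlier.

# Log-transforming the skewed monetary features before standardizing and re-clustering
df_log = df_cluster[features].copy()
df_log[['budget', 'revenue']] = np.log1p(df_log[['budget', 'revenue']])  # log1p tolerates the zeros still present

X_log = StandardScaler().fit_transform(df_log)
kmeans_log = KMeans(n_clusters=3, random_state=42, n_init=10)
labels_log = kmeans_log.fit_predict(X_log)
print(f"Silhouette score (log-transformed features): {silhouette_score(X_log, labels_log):.4f}")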

Overall, K-Means provides a useful, high-level segmentation of movies into typical performers, blockbusters, and critically acclaimed low-budget films, but further refinement could improve interpretability and better capture nuanced patterns in the dataset.