# Load required libraries
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(scales)
# Set the working directory so that relative file paths used later in the
# script resolve against the project folder.
# NOTE(review): this is a hard-coded absolute path on one specific machine, not
# the "current directory"; the script will fail here on any other computer.
# Consider opening the project from its own folder and removing this call.
setwd("C:/Users/johnb/OneDrive/Desktop/ggc/fall_25/ITEC4220")
First, I loaded the dataset into R. Then, I filtered it to include only films with more than 5,000 IMDb votes, which removes low-end outliers: obscure titles whose ratings are based on very few votes.
# Load the full TMDB export from the working directory.
movies <- read.csv("TMDB_all_movies.csv")
# Keep only films with more than 5000 IMDb votes, then parse release_date as a
# Date so it can be placed on a date axis when plotting.
movies_filtered <- movies %>%
  filter(imdb_votes > 5000) %>%
  mutate(release_date = as.Date(release_date))
Then, I created a graph of every film’s revenue over time.
# Plot every film's revenue against its release date.
ggplot(movies_filtered, aes(x = release_date, y = revenue)) +
  geom_line() +
  # `date_breaks` is the documented argument for an interval string such as
  # "20 years"; `breaks` is documented for explicit break positions/functions.
  scale_x_date(date_breaks = "20 years", date_labels = "%Y") +
  # Label the y axis in billions using the scales labeller, matching the
  # label_number() approach used for the average-revenue plot.
  scale_y_continuous(labels = label_number(scale = 1e-9, suffix = "b")) +
  labs(
    title = "Revenue Over Time",
    x = "Year",
    y = "Revenue (in billions)"
  ) +
  theme_minimal() +
  # Center the plot title.
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_line()`).
This shows that, understandably, films’ revenues have increased substantially as the years have progressed.
I will be calculating the average revenue of films, grouped by the year of their release.
# Calculate the average revenue per release year.
# Rows with a missing release_date are dropped along with rows with missing
# revenue: year(NA) is NA, so those films would otherwise be collected into an
# NA "year" group that cannot be positioned on a year axis.
filtered_mean <- movies_filtered %>%
  filter(!is.na(revenue), !is.na(release_date)) %>%
  mutate(year = year(release_date)) %>%
  group_by(year) %>%
  summarise(mean_rev = mean(revenue, na.rm = TRUE)) %>%
  ungroup()
# Line chart of mean revenue by release year; y-axis labels are in millions.
filtered_mean %>%
  ggplot(aes(x = year, y = mean_rev)) +
  geom_line() +
  scale_y_continuous(labels = label_number(suffix = "m", scale = 1e-6)) +
  ggtitle("Average Movie Revenue Over Time") +
  xlab("Year") +
  ylab("Average Revenue") +
  theme_minimal() +
  # Center the plot title.
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
This result shows that the average film has made more money as time has progressed and film has become more and more of an industry.
Now, I will look at the correlation between budget and revenue, to see whether putting more money into a film is associated with the film making more money at the box office.
# Build the table used for the correlation: keep rows where both budget and
# revenue are present, and coerce both columns to numeric.
filtered_corr <- movies_filtered %>%
  filter(!(is.na(budget) | is.na(revenue))) %>%
  mutate(across(c(budget, revenue), as.numeric))
# Pearson correlation test between budget and revenue.
cor.test(filtered_corr$budget, filtered_corr$revenue, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: filtered_corr$budget and filtered_corr$revenue
## t = 152.75, df = 19243, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7338247 0.7465979
## sample estimates:
## cor
## 0.7402781
This correlation test shows a statistically significant, strong positive linear correlation between budget and revenue (r ≈ 0.74, p < 2.2e-16). Movies with a higher budget tend to have higher revenues.
This will be a histogram of IMDb ratings, a measure of the critical success of films.
# Histogram of IMDb ratings in half-point bins, with an axis tick at every
# whole rating from 0 to 10.
movies_filtered %>%
  ggplot(aes(x = imdb_rating)) +
  geom_histogram(color = "black", fill = "skyblue", binwidth = 0.5) +
  scale_x_continuous(breaks = 0:10) +
  ggtitle("Distribution of IMDb Ratings") +
  xlab("IMDb Rating") +
  ylab("Count") +
  theme_minimal() +
  # Center the plot title.
  theme(plot.title = element_text(hjust = 0.5))
This histogram shows that the majority of movies fall in the rating range of 6–7.5 out of 10. This could be used to suggest that financial success doesn’t always lead to critical success, as most films fall into a 3–4 star range (out of 5) regardless of revenue; note that the histogram alone does not show revenue, so the tests below examine that relationship directly.
I will be splitting the dataset in two parts: higher and lower than the mean IMDb rating.
# Mean IMDb rating across the filtered films, ignoring missing ratings.
mean_rating <- mean(movies_filtered$imdb_rating, na.rm = TRUE)
# Split the films on that mean. Both comparisons are strict, so a film rated
# exactly at the mean, or with a missing rating, lands in neither subset.
above_avg <- movies_filtered[
  which(movies_filtered$imdb_rating > mean_rating), ,
  drop = FALSE
]
below_avg <- movies_filtered[
  which(movies_filtered$imdb_rating < mean_rating), ,
  drop = FALSE
]
Then, a t-test will be performed on both subsets’ revenues.
# Coerce revenue to numeric in both rating groups before testing.
above_avg[["revenue"]] <- as.numeric(above_avg[["revenue"]])
below_avg[["revenue"]] <- as.numeric(below_avg[["revenue"]])
# Two-sided Welch t-test on revenue: above-average vs. below-average films.
t_test_result <- t.test(
  above_avg$revenue,
  below_avg$revenue,
  var.equal = FALSE, # Welch's variant: variances are not assumed equal
  alternative = "two.sided" # tests whether the two means differ at all
)
# Show the test summary.
t_test_result
##
## Welch Two Sample t-test
##
## data: above_avg$revenue and below_avg$revenue
## t = 7.7393, df = 16829, p-value = 1.056e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 9847067 16526593
## sample estimates:
## mean of x mean of y
## 44211851 31025021
The t-test shows that movies rated above the IMDb average have a significantly higher mean revenue than movies rated below it, by about $13 million on average (95% CI: roughly $9.8 million to $16.5 million). To look at typical revenue instead of average revenue, I will also run a Wilcoxon rank-sum test, because it compares ranks rather than raw values and is therefore far less sensitive to extreme values.
# Two-sided Wilcoxon rank-sum test comparing revenue between the two rating
# groups (above-average vs. below-average films).
wilcox_test_result <- wilcox.test(
  x = above_avg$revenue,
  y = below_avg$revenue,
  alternative = "two.sided"
)
# Show the test summary.
wilcox_test_result
##
## Wilcoxon rank sum test with continuity correction
##
## data: above_avg$revenue and below_avg$revenue
## W = 45683975, p-value = 0.2224
## alternative hypothesis: true location shift is not equal to 0
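This result shows that, once the influence of extreme values is reduced by comparing ranks, there is not a significant difference (p = 0.22) in the typical revenues of movies above and below the average rating. This suggests that the difference in means is driven by a relatively small number of high-grossing blockbusters.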