Top50moviesp44091 2 2

top50moviesp44091-2
April 12, 2024
[4]: import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
# In [5]: read the spreadsheet into the DataFrame that every later cell uses.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where that exact file exists.
df = pd.read_excel(io="/Users/nairirtisharma/Desktop/Top50Movies.xlsx")

# In [6]: show the column labels available for the analysis.
df.columns
[6]: Index(['Title', 'Year Released', 'Budget (Inflation Adjusted Millions $)',

'World Box Office Receipts (Inflation Adjusted Millions $)',
'U.S. Box Office Receipts (Inflation Adjusted Millions $)', 'Rating',
'Genre', 'Budget (Non-Inflation Adjusted Millions $)',
'World Box Office Receipts (Non-Inflation Adjusted Millions $)',
'U.S. Box Office Receipts (Non-Inflation Adjusted Millions $)'],
dtype='object')
# In [42]: bar chart of the summary statistics returned by df.describe().
description = df.describe()

# `other_description` was plotted but never defined in any visible cell, so
# this cell raised NameError on a fresh kernel.
# NOTE(review): assumed to mean every statistic except the row count -- confirm
# against the intended analysis.
other_description = description.drop(index='count')

# DataFrame.plot draws on a new figure unless it is handed an Axes, which is
# why the earlier plt.figure(figsize=(10, 6)) call only rendered an empty
# "0 Axes" figure. Create the Axes explicitly and pass it in.
fig, ax = plt.subplots(figsize=(10, 6))
other_description.plot(
    kind='bar',
    color=['lightgreen', 'lightcoral', 'lightblue', 'orange'],
    rot=0,  # keep the statistic names horizontal
    ax=ax,
)
ax.set_title('Other Descriptive Statistics')
ax.set_xlabel('Statistic')
ax.set_ylabel('Value')
ax.legend(loc='upper right')
ax.grid(axis='y')
plt.show()
<Figure size 1000x600 with 0 Axes>
1
# In [40]: inflation-adjusted worldwide receipts against release year.
# The cell used to start with df.nlargest(50, <U.S. receipts column>) whose
# result was discarded; the call had no effect on the plot and is omitted.
world_col = 'World Box Office Receipts (Inflation Adjusted Millions $)'

fig, ax = plt.subplots()
# One point per movie, coloured by genre.
sns.scatterplot(data=df, x='Year Released', y=world_col, hue='Genre', ax=ax)
# Regression line over all movies; scatter=False so the points are not drawn twice.
sns.regplot(data=df, x='Year Released', y=world_col, scatter=False, ax=ax)
ax.set_title('Relationship Between Year Released and Inflation-Adjusted World Box Office Receipts')
ax.set_xlabel('Year Released')
# Label previously read "Worlf Box Office Receipts".
ax.set_ylabel('Inflation-Adjusted World Box Office Receipts (Millions $)')
ax.grid(True)
fig.tight_layout()
plt.show()
#INTERPRETATION: In both the 1940s and the 1990s-2000s, the highest box office collections have been for the genre "Drama".
2
#Worldwide box office collections have been increasing across all genres; moreover, the genre "Drama" has not been prevalent after 1970-1980.
# In [15]: inflation-adjusted U.S. receipts against release year.
import warnings

# Install the filter BEFORE plotting. It used to be set after plt.show(), so it
# could not silence anything raised while this cell was drawing.
# NOTE(review): this hides every warning for the rest of the session.
warnings.filterwarnings("ignore")

# The cell used to start with df.nlargest(50, <U.S. receipts column>) whose
# result was discarded; the call had no effect on the plot and is omitted.
us_col = 'U.S. Box Office Receipts (Inflation Adjusted Millions $)'

fig, ax = plt.subplots()
# One point per movie, coloured by genre.
sns.scatterplot(data=df, x='Year Released', y=us_col, hue='Genre', ax=ax)
# Regression line over all movies; scatter=False so the points are not drawn twice.
sns.regplot(data=df, x='Year Released', y=us_col, scatter=False, ax=ax)
ax.set_title('Relationship Between Year Released and Inflation-Adjusted U.S. Box Office Receipts')
ax.set_xlabel('Year Released')
ax.set_ylabel('Inflation-Adjusted U.S. Box Office Receipts (Millions $)')
ax.grid(True)
fig.tight_layout()
plt.show()
3
#INTERPRETATION: In the U.S., box office collections have been decreasing, with fewer movies from the genre 'Drama' and more movies from the genre 'SciFi/Fantasy'.
# In [12]: grid of pairwise plots for the DataFrame's columns.
import warnings

# Set the filter before plotting so it applies to this cell as well.
# NOTE(review): this hides every warning for the rest of the session.
warnings.filterwarnings("ignore")

# sns.pairplot builds its own figure, so the former plt.figure(figsize=(30, 30))
# only produced the empty "3000x3000 with 0 Axes" figure and never resized the
# grid. To change the size, pass `height=` (inches per panel) to pairplot.
sns.pairplot(df)
<Figure size 3000x3000 with 0 Axes>
4
[55]: import plotly.express as px
# In [44]: budget against worldwide receipts, coloured by rating, with OLS trendlines.
budget_col = "Budget (Inflation Adjusted Millions $)"

# Coerce the budget column to numbers; entries that cannot be parsed become NaN ...
df[budget_col] = pd.to_numeric(df[budget_col], errors='coerce')
# ... and the rows holding those NaN budgets are discarded.
df = df.dropna(subset=[budget_col])

# NOTE(review): x is inflation-adjusted while y is NOT inflation-adjusted --
# confirm this pairing is intended.
px.scatter(
    df,
    x=budget_col,
    y="World Box Office Receipts (Non-Inflation Adjusted Millions $)",
    color="Rating",
    hover_name="Genre",
    title="Budget vs. World Box Office Receipts",
    trendline='ols',
)
5
#For PG-rated movies, a reasonable increase in budget has been accompanied by higher collections.
#Even with a larger budget, collections remained the same for UR-rated movies.
#With a larger budget, collections decreased for G-rated movies, both for the U.S. and the world.
# In [122]: budget against U.S. receipts, coloured by rating, with OLS trendlines.
# The title used to say "World" although the y-axis is the U.S. receipts column.
# NOTE(review): x is inflation-adjusted while y is NOT inflation-adjusted --
# confirm this pairing is intended.
px.scatter(
    df,
    x="Budget (Inflation Adjusted Millions $)",
    y="U.S. Box Office Receipts (Non-Inflation Adjusted Millions $)",
    color="Rating",
    hover_name="Genre",
    title="Budget vs. U.S. Box Office Receipts",
    trendline='ols',
)
# In [68]: number of movies per genre.
genre_counts = df['Genre'].value_counts()

# In [69]: share of each genre as a pie chart.
fig, ax = plt.subplots()
ax.pie(
    genre_counts,
    labels=genre_counts.index,
    autopct='%1.1f%%',  # print each wedge's percentage with one decimal
    startangle=140,
)
ax.set_title('Distribution of Movies by Genre')
ax.axis('equal')  # equal axis scaling keeps the pie circular
plt.show()
6
# In [56]: histogram plus frequency / percent-frequency tables of the
# inflation-adjusted U.S. receipts, using 100-million-dollar bins.
us_col = 'U.S. Box Office Receipts (Inflation Adjusted Millions $)'
bin_width = 100

# Rows without a U.S. receipts value cannot be binned; drop them.
df = df.dropna(subset=[us_col])

# Round the maximum UP to the next bin boundary. The previous int(max) + 100
# truncated instead, so a maximum just above a multiple of 100 (e.g. 1700.5)
# landed beyond the last edge and was silently left out of both the histogram
# and the tables.
top_edge = int(np.ceil(df[us_col].max() / bin_width)) * bin_width
bin_edges = list(range(0, top_edge + 1, bin_width))

# Count of movies per bin, in bin order, and the same counts as percentages.
frequency_distribution = pd.cut(df[us_col], bins=bin_edges).value_counts().sort_index()
percent_frequency_distribution = (frequency_distribution / frequency_distribution.sum()) * 100

# NOTE(review): pd.cut intervals are right-closed, (a, b], while plt.hist bins
# are left-closed, [a, b); a value exactly on an edge is counted in different
# bins by the table and the chart.
plt.hist(df[us_col], bins=bin_edges, edgecolor='black', alpha=0.7)
plt.xlabel('Inflation-adjusted U.S. Box Office Receipts (Millions $)')
plt.ylabel('Frequency')
plt.title('Histogram of Inflation-adjusted U.S. Box Office Receipts')
plt.grid(True)
plt.show()

print("Frequency Distribution:")
print(frequency_distribution)
print()
print("Percent Frequency Distribution:")
print(percent_frequency_distribution)
# The outliers are the box office collections above $1500 million
Frequency Distribution:
U.S. Box Office Receipts (Inflation Adjusted Millions $)
(0, 100] 0
(100, 200] 0
(200, 300] 0
(300, 400] 0
(400, 500] 1
(500, 600] 14
(600, 700] 6
(700, 800] 5
(800, 900] 4
(900, 1000] 1
(1000, 1100] 3
8
(1100, 1200] 0
(1200, 1300] 0
(1300, 1400] 0
(1400, 1500] 1
(1500, 1600] 0
(1600, 1700] 1
Name: count, dtype: int64
Percent Frequency Distribution:

U.S. Box Office Receipts (Inflation Adjusted Millions $)
(0, 100] 0.000000
(100, 200] 0.000000
(200, 300] 0.000000
(300, 400] 0.000000
(400, 500] 2.777778
(500, 600] 38.888889
(600, 700] 16.666667
(700, 800] 13.888889
(800, 900] 11.111111
(900, 1000] 2.777778
(1000, 1100] 8.333333
(1100, 1200] 0.000000
(1200, 1300] 0.000000
(1300, 1400] 0.000000
(1400, 1500] 2.777778
(1500, 1600] 0.000000
(1600, 1700] 2.777778
Name: count, dtype: float64
# In [48]: genre-by-rating cross-tabulations and mean U.S. receipts per pair.
budget_col = "Budget (Inflation Adjusted Millions $)"

# Coerce budgets to numbers and discard the rows where that fails (NaN).
df[budget_col] = pd.to_numeric(df[budget_col], errors='coerce')
df = df.dropna(subset=[budget_col])


def _genre_rating_counts(frame):
    """Return the number of rows in `frame` for each Genre/Rating pair (0 if absent)."""
    return pd.pivot_table(frame, index='Genre', columns='Rating', aggfunc='size', fill_value=0)


# NOTE(review): head(50) takes the first 50 rows still present after the
# dropna calls; it does not rank the movies -- confirm this is the intended
# "top 50".
pivot_top_50 = _genre_rating_counts(df.head(50))
print("Cross-tabulation for movie genre and rating for the top 50 movies:")
print(pivot_top_50)
print()

# NOTE(review): the filter is >= 1980, so movies released in 1980 are included
# even though the caption says "after 1980".
df1_after_1980 = df[df['Year Released'] >= 1980]
pivot_after_1980 = _genre_rating_counts(df1_after_1980)
print("Cross-tabulation for movie genre and rating for movies released after 1980:")
print(pivot_after_1980)
print()

# Mean receipts per Genre/Rating pair; pairs with no movies show as NaN.
pivot_avg_receipts = pd.pivot_table(
    df,
    index='Genre',
    columns='Rating',
    values='U.S. Box Office Receipts (Inflation Adjusted Millions $)',
    aggfunc='mean',
)
print("Average inflation-adjusted U.S. box office receipts for each genre-rating pair:")
print(pivot_avg_receipts)
Cross-tabulation for movie genre and rating for the top 50 movies:
Rating G PG PG-13 R UR
Genre
Action 1 2 3 0 0
Animated 5 1 0 0 0
Comedy 0 1 0 0 0
Drama 4 2 3 1 2
Horror 0 0 0 1 0
SciFi/Fantasy 0 5 5 0 0
Cross-tabulation for movie genre and rating for movies released after 1980:
Rating G PG PG-13
Genre
Action 0 1 3
Animated 1 1 0
Drama 0 0 2
SciFi/Fantasy 0 4 5
Average inflation-adjusted U.S. box office receipts for each genre-rating pair:
Rating G PG PG-13 R UR
Genre
Action 506.600 856.35 664.100000 NaN NaN
Animated 665.140 557.40 NaN NaN NaN
Comedy NaN 561.90 NaN NaN NaN
Drama 1016.175 615.10 895.766667 622.8 519.85
Horror NaN NaN NaN 808.7 NaN
SciFi/Fantasy NaN 834.58 561.860000 NaN NaN
[ ]: #Cross-tabulation for movie genre and rating for the top 50 movies:
#This table shows the count of movies for each combination of genre and rating␣
↪among the top 50 movies.
#Insights:
10
#The most represented genre-rating pairs are Animated-G, SciFi/Fantasy-PG and SciFi/Fantasy-PG-13, each appearing 5 times. Horror is represented by only one movie, with an R rating. Two movies in the top 50 are rated UR (Unrated), both of them Dramas.
#Cross-tabulation for movie genre and rating for movies released after 1980:
#This table presents the count of movies for each genre-rating combination␣
↪among movies released after 1980.
#Insights: PG-13 rated SciFi/Fantasy movies are the most common (5 movies), followed by PG rated SciFi/Fantasy (4 movies) and PG-13 rated Action (3 movies); PG-13 rated Drama accounts for 2 movies.
#Average inflation-adjusted U.S. box office receipts for each genre-rating pair:
↪This table displays the average box office receipts for each genre-rating␣
↪combination.
#Insights: #Drama movies with a G rating have the highest average box office␣
↪receipts at $1016.175 million.
#Horror movies with an R rating average $808.7 million, behind Drama/PG-13 ($895.77 million), Action/PG ($856.35 million) and SciFi/Fantasy/PG ($834.58 million).

#Animated movies with a G rating also have a relatively high average box office␣
↪performance at $665.14 million.
#Comedy movies with a PG rating have an average box office receipts of $561.90␣
↪million.
#Action movies with a PG rating have an average box office receipts of $856.35␣
↪million.
#SciFi/Fantasy movies with a PG rating also have a high average box office␣
↪receipts of $834.58 million.
11

Top50moviesp44091 2 2

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Top50moviesp44091 2 2

Uploaded by

Copyright:

Available Formats

top50moviesp44091-2

April 12, 2024

[4]: import matplotlib.pyplot as plt

[6]: Index(['Title', 'Year Released', 'Budget (Inflation Adjusted Millions $)',

[42]: # Assuming df is your DataFrame

plt.title('Other Descriptive Statistics')

<Figure size 1000x600 with 0 Axes>

sns.regplot(data=df, x='Year Released', y='World Box Office Receipts (Inflation␣

plt.title('Relationship Between Year Released and Inflation-Adjusted World Box␣

sns.regplot(data=df, x='Year Released', y='U.S. Box Office Receipts (Inflation␣

plt.title('Relationship Between Year Released and Inflation-Adjusted U.S. Box␣

[12]: plt.figure(figsize=(30, 30))

<Figure size 3000x3000 with 0 Axes>

[44]: df["Budget (Inflation Adjusted Millions $)"] = pd.to_numeric(df["Budget␣

# To remove rows where the conversion failed (NaN values)

px.scatter(df, x="Budget (Inflation Adjusted Millions $)", y="World Box Office␣

color="Rating", hover_name="Genre", title="Budget vs. World␣

color="Rating", hover_name="Genre", title="Budget vs. World␣

[68]: genre_counts = df['Genre'].value_counts()

[69]: plt.pie(genre_counts, labels=genre_counts.index, autopct='%1.1f%%',␣

plt.title('Distribution of Movies by Genre')

bin_edges = list(range(0, int(df['U.S. Box Office Receipts (Inflation Adjusted␣

frequency_distribution = pd.cut(df['U.S. Box Office Receipts (Inflation␣

plt.xlabel('Inflation-adjusted U.S. Box Office Receipts (Millions $)')

print("Percent Frequency Distribution:")

Percent Frequency Distribution:

[48]: df["Budget (Inflation Adjusted Millions $)"] = pd.to_numeric(df["Budget␣

# To remove rows where the conversion failed (NaN values)

df1_after_1980 = df[df['Year Released'] >= 1980]

pivot_after_1980 = pd.pivot_table(df1_after_1980, index='Genre',␣

pivot_avg_receipts = pd.pivot_table(df, index='Genre', columns='Rating',␣

print("Average inflation-adjusted U.S. box office receipts for each␣

↪an R rating.No movies in the top 50 are rated UR (Unrated).

#Horror movies with an R rating follow with an average of $808.7 million.

You might also like