# IMDB dataset on Kaggle: https://www.kaggle.com/datasets/PromptCloudHQ/imdb-data
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#Load the IMDB movie CSV into a DataFrame
df = pd.read_csv("IMDB-Movie-Data.csv")
#Preview the first five rows, then report the (rows, columns) shape
df.head(5)
df.shape
Output:
(1000, 12)
#Print the DataFrame summary produced by info()
df.info()
#Count the missing entries in each column
missing_flags = df.isnull()
missing_flags.sum()
Output:
#Heatmap of the boolean missing-value mask
missing_mask = df.isnull()
sns.heatmap(missing_mask)
plt.show()
#Missing entries per column, expressed as a percentage of the row count
row_count = len(df)
percentage_missing = df.isnull().sum() * 100 / row_count
print(percentage_missing)
#Drop every row that contains at least one missing value (axis=0 selects rows)
#NOTE(review): the result is not assigned back to df, so unless dropna mutates
#in place (pandas' documented default is inplace=False) df keeps all its rows
df.dropna(axis=0)
Output:
#Flag duplicated rows, then report whether any were found
repeated_rows = df.duplicated()
duplicate_data = repeated_rows.any()
print(duplicate_data)
False
#To get statistics about the dataframe, for every column (include="all")
df.describe(include="all")
Output:
#Display the titles of the movies having a runtime of at least 180 minutes
#List the column names first, presumably to look up the exact runtime label
df.columns
Output:
Index(['Rank', 'Title', 'Genre', 'Description', 'Director', 'Actors', 'Year', 'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)', 'Metascore'], dtype='object')
#Boolean mask of movies running 180 minutes or longer, then their titles
long_runtime = df['Runtime (Minutes)'] >= 180
df.loc[long_runtime, 'Title']
Output:
#Which year had the highest average number of votes?
#Mean of Votes for each Year, ordered from highest to lowest
mean_votes_by_year = df.groupby('Year')['Votes'].mean()
mean_votes_by_year.sort_values(ascending=False)
Output:
Year
2012 285226.093750
2008 275505.384615
2006 269289.954545
2009 255780.647059
2010 252782.316667
2007 244331.037736
2011 240790.301587
2013 219049.648352
2014 203930.224490
2015 115726.220472
2016 48591.754209
Name: Votes, dtype: float64
#Bar chart of Votes against Year
sns.barplot(data=df, x='Year', y='Votes')
plt.title("Average votes by year")
plt.show()
#In which year there was the highest average revenue
#List the column names, presumably to look up the exact revenue column label
df.columns
Output:
Index(['Rank', 'Title', 'Genre', 'Description', 'Director', 'Actors', 'Year', 'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)', 'Metascore'], dtype='object')
#Average revenue per year, highest first (the top row is the year with the
#highest average revenue); mean() skips movies whose revenue is missing
print(df.groupby('Year')['Revenue (Millions)'].mean().sort_values(ascending=False))
#Find the average rating for each director, highest first
df.groupby('Director')['Rating'].mean().sort_values(ascending=False)
Output:
Director
Nitesh Tiwari 8.80
Christopher Nolan 8.68
Olivier Nakache 8.60
Makoto Shinkai 8.60
Aamir Khan 8.50
...
Micheal Bafaro 3.50
Jonathan Holbrook 3.20
Shawn Burkett 2.70
James Wong 2.70
Jason Friedberg 1.90
Name: Rating, Length: 644, dtype: float64
#To display the Title and Runtime of the 10 longest movies
#List the column names, presumably to look up the exact runtime column label
df.columns
Output:
Index(['Rank', 'Title', 'Genre', 'Description', 'Director', 'Actors', 'Year', 'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)', 'Metascore'], dtype='object')
#Ten rows with the largest runtime, reduced to title and runtime, indexed by title
longest = df.nlargest(10, 'Runtime (Minutes)')
top10 = longest[['Title', 'Runtime (Minutes)']].set_index('Title')
print(top10)
Runtime (Minutes)
Title
Grindhouse 191
The Hateful Eight 187
The Wolf of Wall Street 180
La vie d'Adèle 180
Inland Empire 180
Cloud Atlas 172
3 Idiots 170
Interstellar 169
Pirates of the Caribbean: At World's End 169
The Hobbit: An Unexpected Journey 169
#Runtime on the x-axis, titles (the top10 index) on the y-axis
sns.barplot(data=top10, x='Runtime (Minutes)', y=top10.index)
plt.show()
#To get the number of movies per year
#List the column names, presumably to look up the exact year column label
df.columns
Output:
Index(['Rank', 'Title', 'Genre', 'Description', 'Director', 'Actors', 'Year', 'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)', 'Metascore'], dtype='object')
#Count how many rows (movies) fall under each Year value
year_column = df['Year']
year_column.value_counts()
Output:
2016 297
2015 127
2014 98
2013 91
2012 64
2011 63
2010 60
2007 53
2008 52
2009 51
2006 44
Name: Year, dtype: int64
#Count plot with one bar per Year value
sns.countplot(data=df, x='Year')
plt.show()
#Find the most popular movie title (the one with the highest revenue)
#List the column names, presumably to look up the exact revenue column label
df.columns
Index(['Rank', 'Title', 'Genre', 'Description', 'Director', 'Actors', 'Year', 'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)', 'Metascore'], dtype='object')
#Title(s) of the row(s) whose revenue equals the column maximum
revenue = df['Revenue (Millions)']
df.loc[revenue == revenue.max(), 'Title']
Output:
50 Star Wars: Episode VII - The Force Awakens
Name: Title, dtype: object
Silan Software is one of India's leading providers of offline & online training for Java, Python, AI (Machine Learning, Deep Learning), Data Science, Software Development & many more emerging technologies.
We provide Academic Training || Industrial Training || Corporate Training || Internship || Java || Python || AI using Python || Data Science, etc.