-
Notifications
You must be signed in to change notification settings - Fork 0
/
portfolio_project_02.py
175 lines (141 loc) · 6.01 KB
/
portfolio_project_02.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# -*- coding: utf-8 -*-
"""Portfolio Project # 02
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1TaRVExNtLxhv5xeCU0WL3WzW8SntjI1m
"""
# Importing necessary libraries for data analysis and visualization
import pandas as pd # For data manipulation and analysis
import seaborn as sns # For statistical data visualization
import matplotlib.pyplot as plt # For creating static, interactive, and animated visualizations
# Load the restaurant dataset from the path used in the Colab session.
file_path = '/content/zomato_2.csv'
df = pd.read_csv(file_path)

# Preview the data. Outside a notebook a bare expression such as `df.head()`
# is evaluated and then discarded, so the results are printed explicitly.
print(df.head())
print(df.columns)

# Column dtypes and non-null counts (DataFrame.info prints its own report).
df.info()

# Descriptive statistics of the DataFrame.
print(df.describe())

# Count the rows whose restaurant name occurs more than once; keep=False marks
# every occurrence of a repeated name, not only the later ones.
duplicate_names = df[df.duplicated('name', keep=False)]
print(f"Number of duplicate names: {duplicate_names.shape[0]}")

# Rename columns for clarity and ease of use.
df = df.rename(
    columns={
        'approx_cost(for two people)': 'avg_cost_for_two',
        'listed_in(city)': 'city',
        'listed_in(type)': 'type',
    }
)

# Remove rows that are exact duplicates across all columns.
df = df.drop_duplicates()

# Number of missing values in each column.
null_values = df.isnull().sum()
print(null_values)
# Compare how many restaurants do and do not accept online orders
# (one bar per distinct `online_order` value).
# NOTE(review): seaborn 0.13+ warns when `palette` is passed without `hue` —
# confirm the installed version before changing the call.
ax = sns.countplot(data=df, x='online_order', palette='Set2')
ax.set_title('Online Delivery vs No Online Delivery')
ax.set_ylabel('Number of Restaurants')
plt.show()
# Mean rating for each `book_table` value, drawn as a bar chart.
# NOTE(review): assumes `rate` is already numeric in this CSV — confirm.
avg_rating_booking = df.groupby('book_table')['rate'].mean()
ax = avg_rating_booking.plot.bar(color='lightblue')
ax.set_title("Average Rating vs Table Booking Availability")
ax.set_xlabel('Table Booking Availability')
ax.set_ylabel('Average Rating')
plt.show()

# Mean rating for each `online_order` value, drawn as a bar chart.
avg_online_ordering = df.groupby('online_order')['rate'].mean()
ax = avg_online_ordering.plot.bar(color='lightgreen')
ax.set_title("Average Rating vs Online Ordering ")
ax.set_xlabel('Online Ordering ')
ax.set_ylabel('Average Rating')
plt.show()
# Count restaurants per (type, online_order) pair, then pivot `online_order`
# into columns so each restaurant type becomes one stacked bar.
type_order_counts = df.groupby(['type', 'online_order']).size()
rest_type_online_order = type_order_counts.unstack()
ax = rest_type_online_order.plot.bar(stacked=True, color=['#66b3ff', '#99ff99'])
ax.set_title("Restaurant Type vs Online Ordering")
ax.set_xlabel('Restaurant Type')
ax.set_ylabel('Number of Restaurants')
plt.show()
# Mean rating per location, computed once and ranked in both directions below.
mean_rate_by_location = df.groupby('location')['rate'].mean()

# The ten locations with the highest mean rating.
location_ratings = mean_rate_by_location.sort_values(ascending=False).head(10)
ax = location_ratings.plot.bar(color='skyblue')
ax.set_title("Top 10 Locations by Average Rating")
ax.set_xlabel('Location')
ax.set_ylabel('Average Rating')
plt.show()

# The ten locations with the lowest mean rating.
bottom_locations = mean_rate_by_location.sort_values(ascending=True).head(10)
ax = bottom_locations.plot.bar(color='lightgrey')
ax.set_title("Bottom 10 Locations by Average Rating")
ax.set_xlabel('Location')
ax.set_ylabel('Average Rating')
plt.show()
# Rank restaurant names by how many rows carry them and keep the ten most
# frequent; each row is treated as one outlet of that name.
name_frequency = df['name'].value_counts()
popular_chains = name_frequency.head(10)
ax = popular_chains.plot.bar(color='purple')
ax.set_title("Top 10 Most Popular Restaurant Chains")
ax.set_xlabel('Restaurant Name')
ax.set_ylabel('Number of Outlets')
plt.show()
# Show how restaurant ratings are distributed: histogram of `rate`, 20 bins.
ax = df['rate'].plot.hist(bins=20, color='orange')
ax.set_title("Distribution of Restaurant Ratings")
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
plt.show()
# Scatter each restaurant's vote count (x) against its rating (y) to see how
# the two relate.
ax = sns.scatterplot(data=df, x='votes', y='rate')
ax.set_title("Votes vs Ratings")
ax.set_xlabel('Number of Votes')
ax.set_ylabel('Rating')
plt.show()
# The ten locations that appear on the most rows, i.e. host the most listings.
rows_per_location = df['location'].value_counts()
location_count = rows_per_location.head(10)
ax = location_count.plot.bar(color='pink')
ax.set_title("Top 10 Locations by Number of Restaurants")
ax.set_xlabel('Location')
ax.set_ylabel('Number of Restaurants')
plt.show()
# Mean number of votes for each `online_order` value, drawn as a bar chart.
votes_by_online_order = df.groupby('online_order')['votes']
avg_votes_online = votes_by_online_order.mean()
ax = avg_votes_online.plot.bar(color='teal')
ax.set_title("Average Votes vs Online Ordering")
ax.set_xlabel('Online Ordering')
ax.set_ylabel('Average Votes')
plt.show()
# Identify the highest-rated restaurants (up to 10 distinct names).
# The same name can occur on several rows, and matplotlib draws a single bar
# per distinct label, so only the best-rated row of each name is kept before
# the ranking is cut off. Without this the chart can show fewer bars than rows.
top_rated_restaurants = (
    df[['name', 'rate']]
    .sort_values(by='rate', ascending=False)
    .drop_duplicates(subset='name')
    .head(10)
)

# Create a horizontal bar plot: ratings on the x-axis, names on the y-axis,
# which is what the axis labels below describe.
plt.figure(figsize=(10, 6))
plt.barh(top_rated_restaurants['name'], top_rated_restaurants['rate'])
# barh places the first row at the bottom; flip so the best rating is on top.
plt.gca().invert_yaxis()
plt.xlabel('Rating')
plt.ylabel('Restaurant Name')
# Take the count from the data so the title cannot disagree with the chart.
plt.title(f'Top {len(top_rated_restaurants)} Most Rated Restaurants')
plt.show()
# Mean votes per location; keep the ten locations with the highest mean.
mean_votes_by_location = df.groupby('location')['votes'].mean()
avg_votes_location = mean_votes_by_location.sort_values(ascending=False).head(10)
ax = avg_votes_location.plot.bar(color='gold')
ax.set_title("Average Votes by Location")
ax.set_xlabel('Location')
ax.set_ylabel('Average Votes')
plt.show()
# Identify the most expensive restaurants (up to 5 distinct names) based on
# cost for two people.
# A name can occur on several rows and matplotlib draws one bar per distinct
# label, so collapse to the highest cost per name before taking the top rows.
# NOTE(review): assumes `avg_cost_for_two` is numeric in this CSV — confirm.
max_cost_per_name = df.groupby('name', as_index=False)['avg_cost_for_two'].max()
top5_expensive = max_cost_per_name.nlargest(5, 'avg_cost_for_two')

# --- Plot for Most Expensive Restaurants ---
# Horizontal bars put cost on the x-axis and names on the y-axis, matching the
# axis labels; the wider figure leaves room for the restaurant names.
plt.figure(figsize=(10, 3))
plt.barh(top5_expensive['name'], top5_expensive['avg_cost_for_two'], color='salmon')
# barh places the first row at the bottom; flip so the costliest is on top.
plt.gca().invert_yaxis()
plt.xlabel('Cost for Two')
plt.ylabel('Restaurant Name')
# Take the count from the data so the title cannot disagree with the chart.
plt.title(f'Top {len(top5_expensive)} Most Expensive Restaurants')
plt.show()
# Identify the cheapest restaurants (up to 5 distinct names) based on cost for
# two people.
# A name can occur on several rows and matplotlib draws one bar per distinct
# label, so collapse to the lowest cost per name before taking the bottom rows.
# NOTE(review): assumes `avg_cost_for_two` is numeric in this CSV — confirm.
min_cost_per_name = df.groupby('name', as_index=False)['avg_cost_for_two'].min()
cheapest = min_cost_per_name.nsmallest(5, 'avg_cost_for_two')

plt.figure(figsize=(10, 3))
plt.barh(cheapest['name'], cheapest['avg_cost_for_two'], color='indigo')
plt.xlabel('Cost for Two')
plt.ylabel('Restaurant Name')
plt.title(f'Top {len(cheapest)} Cheapest Restaurants')
# Render this figure now; otherwise the next plotting call that targets the
# current axes would be drawn on top of these bars.
plt.show()
# Compare how many restaurants do and do not offer table booking
# (one bar per distinct `book_table` value).
# NOTE(review): seaborn 0.13+ warns when `palette` is passed without `hue` —
# confirm the installed version before changing the call.
ax = sns.countplot(data=df, x='book_table', palette='viridis')
ax.set_title('Table Booking vs No Table Booking')
ax.set_ylabel('Number of Restaurants')
plt.show()