# portfolio_project_02.py

# -*- coding: utf-8 -*-
"""Portfolio Project # 02

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1TaRVExNtLxhv5xeCU0WL3WzW8SntjI1m
"""

# Importing necessary libraries for data analysis and visualization
import pandas as pd  # For data manipulation and analysis
import seaborn as sns  # For statistical data visualization
import matplotlib.pyplot as plt  # For creating static, interactive, and animated visualizations

# Load the dataset.
# NOTE(review): the path points into Colab's /content directory (the file
# header says this script was exported from Colab) -- adjust it when running
# elsewhere.
file_path = '/content/zomato_2.csv'
df = pd.read_csv(file_path)

# Outside a notebook the value of a bare expression is silently discarded,
# so every inspection result below is printed explicitly.

# Display the first few rows of the DataFrame
print(df.head())

# Display the names of all columns in the DataFrame
print(df.columns)

# Print a concise summary of the DataFrame (info() writes its own output)
df.info()

# Generate descriptive statistics of the DataFrame
print(df.describe())

# Count every row whose restaurant name occurs more than once
# (keep=False flags all occurrences, not only the repeats).
is_repeated_name = df.duplicated('name', keep=False)
duplicate_names = df[is_repeated_name]
print(f"Number of duplicate names: {duplicate_names.shape[0]}")

# Give the verbose source columns short, code-friendly names.
column_renames = {
    'approx_cost(for two people)': 'avg_cost_for_two',
    'listed_in(city)': 'city',
    'listed_in(type)': 'type',
}
df = df.rename(columns=column_renames)

# Drop rows that are exact duplicates across all columns.
df = df.drop_duplicates()

# Report how many missing values each column contains.
null_values = df.isna().sum()
print(null_values)

# Compare how many restaurants do and do not accept online orders
# (bar heights are row counts per 'online_order' value).
ax = sns.countplot(data=df, x='online_order', palette='Set2')
ax.set_title('Online Delivery vs No Online Delivery')
ax.set_ylabel('Number of Restaurants')
plt.show()

# Compare the mean rating of restaurants with and without table booking.
avg_rating_booking = df.groupby('book_table')['rate'].mean()
ax = avg_rating_booking.plot.bar(color='lightblue')
ax.set(
    title="Average Rating vs Table Booking Availability",
    xlabel='Table Booking Availability',
    ylabel='Average Rating',
)
plt.show()

# Compare the mean rating of restaurants with and without online ordering.
avg_online_ordering = df.groupby('online_order')['rate'].mean()
ax = avg_online_ordering.plot.bar(color='lightgreen')
ax.set_title("Average Rating vs Online Ordering ")
ax.set_xlabel('Online Ordering ')
ax.set_ylabel('Average Rating')
plt.show()

# Count restaurants per (type, online_order) pair, then pivot the
# online_order level into columns so each type becomes one stacked bar.
counts_by_type_and_order = df.groupby(['type', 'online_order']).size()
rest_type_online_order = counts_by_type_and_order.unstack()
ax = rest_type_online_order.plot.bar(stacked=True, color=['#66b3ff', '#99ff99'])
ax.set(
    title="Restaurant Type vs Online Ordering",
    xlabel='Restaurant Type',
    ylabel='Number of Restaurants',
)
plt.show()

# Rank locations by mean rating and keep the ten best.
mean_rate_by_location = df.groupby('location')['rate'].mean()
location_ratings = mean_rate_by_location.sort_values(ascending=False).head(10)
ax = location_ratings.plot.bar(color='skyblue')
ax.set_title("Top 10 Locations by Average Rating")
ax.set_xlabel('Location')
ax.set_ylabel('Average Rating')
plt.show()

# Rank locations by mean rating and keep the ten worst.
mean_rate_per_location = df.groupby('location')['rate'].mean()
bottom_locations = mean_rate_per_location.sort_values(ascending=True).head(10)
ax = bottom_locations.plot.bar(color='lightgrey')
ax.set(
    title="Bottom 10 Locations by Average Rating",
    xlabel='Location',
    ylabel='Average Rating',
)
plt.show()

# Treat each repeated restaurant name as a chain and show the ten names
# that appear on the most rows (one row per outlet).
outlets_per_name = df['name'].value_counts()
popular_chains = outlets_per_name.head(10)
ax = popular_chains.plot.bar(color='purple')
ax.set_title("Top 10 Most Popular Restaurant Chains")
ax.set_xlabel('Restaurant Name')
ax.set_ylabel('Number of Outlets')
plt.show()

# Show how restaurant ratings are distributed (20-bin histogram of 'rate').
ax = df['rate'].plot(kind='hist', bins=20, color='orange')
ax.set(
    title="Distribution of Restaurant Ratings",
    xlabel='Rating',
    ylabel='Frequency',
)
plt.show()

# Scatter each restaurant's vote count against its rating.
ax = sns.scatterplot(data=df, x='votes', y='rate')
ax.set_title("Votes vs Ratings")
ax.set_xlabel('Number of Votes')
ax.set_ylabel('Rating')
plt.show()

# Show the ten locations that appear on the most restaurant rows.
restaurants_per_location = df['location'].value_counts()
location_count = restaurants_per_location.head(10)
ax = location_count.plot.bar(color='pink')
ax.set(
    title="Top 10 Locations by Number of Restaurants",
    xlabel='Location',
    ylabel='Number of Restaurants',
)
plt.show()

# Compare the mean vote count of restaurants with and without online ordering.
avg_votes_online = df.groupby('online_order')['votes'].mean()
ax = avg_votes_online.plot.bar(color='teal')
ax.set_title("Average Votes vs Online Ordering")
ax.set_xlabel('Online Ordering')
ax.set_ylabel('Average Votes')
plt.show()

# Identify the 10 highest-rated restaurants (rows sorted by 'rate', best first).
top_rated_restaurants = df[['name', 'rate']].sort_values(by='rate', ascending=False).head(10)

# Create a vertical bar plot: restaurant names on the x-axis, ratings on the
# y-axis.
plt.figure(figsize=(10, 6))
plt.bar(top_rated_restaurants['name'], top_rated_restaurants['rate'])

# Axis labels follow the plt.bar argument order (names -> x, ratings -> y),
# and the title states the number of rows actually selected above.
plt.xlabel('Restaurant Name')
plt.ylabel('Rating')
plt.title('Top 10 Most Rated Restaurants')
plt.xticks(rotation=90)  # Rotate the name labels by 90 degrees
plt.show()

# Rank locations by mean vote count and plot the ten highest.
mean_votes_by_location = df.groupby('location')['votes'].mean()
avg_votes_location = mean_votes_by_location.sort_values(ascending=False).head(10)
ax = avg_votes_location.plot.bar(color='gold')
ax.set(
    title="Average Votes by Location",
    xlabel='Location',
    ylabel='Average Votes',
)
plt.show()

# Identify the 5 most expensive restaurants based on cost for two people.
top5_expensive = df.nlargest(5, 'avg_cost_for_two')[['name', 'avg_cost_for_two']]

# --- Plot for Most Expensive Restaurants ---
# Vertical bars: restaurant names on the x-axis, cost on the y-axis.
plt.figure(figsize=(3, 3))
plt.bar(top5_expensive['name'], top5_expensive['avg_cost_for_two'], color='salmon')
# Axis labels follow the plt.bar argument order (names -> x, cost -> y),
# and the title states the number of rows actually selected above.
plt.xlabel('Restaurant Name')
plt.ylabel('Cost for Two')
plt.title('Top 5 Most Expensive Restaurants')
plt.xticks(rotation=90)  # Rotate x-axis labels by 90 degrees
plt.show()

# Identify the 5 cheapest restaurants based on cost for two people.
cheapest = df.nsmallest(5, 'avg_cost_for_two')[['name', 'avg_cost_for_two']]

# --- Plot for Cheapest Restaurants ---
# Horizontal bars: cost on the x-axis, restaurant names on the y-axis.
plt.figure(figsize=(10, 3))
plt.barh(cheapest['name'], cheapest['avg_cost_for_two'], color='indigo')
plt.xlabel('Cost for Two')
plt.ylabel('Restaurant Name')
plt.title('Top 5 Cheapest Restaurants')
# Finish this figure here; without show() the plot that follows would be
# drawn onto these same axes.
plt.show()

# Compare how many restaurants do and do not offer table booking
# (bar heights are row counts per 'book_table' value).
ax = sns.countplot(data=df, x='book_table', palette='viridis')
ax.set_title('Table Booking vs No Table Booking')
ax.set_ylabel('Number of Restaurants')
plt.show()