InĀ [13]:
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Read the dataset
data = pd.read_csv('sales_data_sample.csv', encoding='ISO-8859-1')
print('Data before processing')
print(data.info())
# 1. Convert the ORDERDATE column to a proper datetime format
data['ORDERDATE'] = pd.to_datetime(data['ORDERDATE'], errors='coerce')
# 2. Handle missing values:
# Fill missing values for 'ADDRESSLINE2' with empty string
data['ADDRESSLINE2'].fillna('', inplace=True)
# Fill missing values for 'STATE', 'POSTALCODE', and 'TERRITORY' with 'Unknown'
data['STATE'].fillna('Unknown', inplace=True)
data['POSTALCODE'].fillna('Unknown', inplace=True)
data['TERRITORY'].fillna('Unknown', inplace=True)
print('Data after processing')
data.info()
Data before processing <class 'pandas.core.frame.DataFrame'> RangeIndex: 2823 entries, 0 to 2822 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDERNUMBER 2823 non-null int64 1 QUANTITYORDERED 2823 non-null int64 2 PRICEEACH 2823 non-null float64 3 ORDERLINENUMBER 2823 non-null int64 4 SALES 2823 non-null float64 5 ORDERDATE 2823 non-null object 6 STATUS 2823 non-null object 7 QTR_ID 2823 non-null int64 8 MONTH_ID 2823 non-null int64 9 YEAR_ID 2823 non-null int64 10 PRODUCTLINE 2823 non-null object 11 MSRP 2823 non-null int64 12 PRODUCTCODE 2823 non-null object 13 CUSTOMERNAME 2823 non-null object 14 PHONE 2823 non-null object 15 ADDRESSLINE1 2823 non-null object 16 ADDRESSLINE2 302 non-null object 17 CITY 2823 non-null object 18 STATE 1337 non-null object 19 POSTALCODE 2747 non-null object 20 COUNTRY 2823 non-null object 21 TERRITORY 1749 non-null object 22 CONTACTLASTNAME 2823 non-null object 23 CONTACTFIRSTNAME 2823 non-null object 24 DEALSIZE 2823 non-null object dtypes: float64(2), int64(7), object(16) memory usage: 551.5+ KB None Data after processing <class 'pandas.core.frame.DataFrame'> RangeIndex: 2823 entries, 0 to 2822 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDERNUMBER 2823 non-null int64 1 QUANTITYORDERED 2823 non-null int64 2 PRICEEACH 2823 non-null float64 3 ORDERLINENUMBER 2823 non-null int64 4 SALES 2823 non-null float64 5 ORDERDATE 2823 non-null datetime64[ns] 6 STATUS 2823 non-null object 7 QTR_ID 2823 non-null int64 8 MONTH_ID 2823 non-null int64 9 YEAR_ID 2823 non-null int64 10 PRODUCTLINE 2823 non-null object 11 MSRP 2823 non-null int64 12 PRODUCTCODE 2823 non-null object 13 CUSTOMERNAME 2823 non-null object 14 PHONE 2823 non-null object 15 ADDRESSLINE1 2823 non-null object 16 ADDRESSLINE2 2823 non-null object 17 CITY 2823 non-null object 18 STATE 2823 non-null object 19 POSTALCODE 2823 non-null object 20 COUNTRY 2823 non-null object 21 TERRITORY 2823 non-null object 22 CONTACTLASTNAME 2823 non-null object 23 CONTACTFIRSTNAME 2823 non-null object 24 DEALSIZE 2823 non-null object dtypes: datetime64[ns](1), float64(2), int64(7), object(15) memory usage: 551.5+ KB
InĀ [14]:
# Calculate total revenue for each product
data['Revenue'] = data['QUANTITYORDERED'] * data['PRICEEACH']
product_metrics = data.groupby('PRODUCTCODE').agg({
'Revenue': 'sum',
'QUANTITYORDERED': 'sum',
'PRICEEACH': 'mean',
'ORDERNUMBER': 'count' # Total number of orders
}).rename(columns={
'Revenue': 'Total Revenue',
'QUANTITYORDERED': 'Total Units Sold',
'PRICEEACH': 'Average Price per Unit',
'ORDERNUMBER': 'Total Orders'
}).reset_index()
# Rank products based on these metrics
product_metrics = product_metrics.sort_values(by='Total Revenue', ascending=False)
print(product_metrics)
PRODUCTCODE Total Revenue Total Units Sold Average Price per Unit \
39 S18_3232 176026.63 1774 99.045000
76 S24_3856 103489.89 1052 98.068519
50 S18_4600 101835.00 1031 98.374444
63 S24_2300 99600.00 996 100.000000
25 S18_2238 96300.00 966 99.555556
.. ... ... ... ...
62 S24_2022 42524.98 851 50.862400
69 S24_2972 42145.94 912 45.498519
85 S32_2206 39237.71 836 47.084000
59 S24_1937 37392.38 844 43.992000
78 S24_3969 33181.66 745 45.046818
Total Orders
39 52
76 27
50 27
63 27
25 27
.. ...
62 25
69 27
85 25
59 25
78 22
[109 rows x 5 columns]
InĀ [15]:
# Convert 'ORDERDATE' to datetime
data['ORDERDATE'] = pd.to_datetime(data['ORDERDATE'])
# Extract the month
data['Month'] = data['ORDERDATE'].dt.month.astype(str).str.zfill(2)
data = data.sort_values(by='Month')
monthly_metrics = data.groupby('Month').agg({
'Revenue': 'sum',
'QUANTITYORDERED': 'sum',
'PRICEEACH': 'mean'
}).rename(columns={
'Revenue': 'Total Revenue',
'QUANTITYORDERED': 'Total Units Sold',
'PRICEEACH': 'Average Price per Unit'
}).reset_index()
# Plot metrics over time
plt.figure(figsize=(12, 6))
sns.lineplot(data=monthly_metrics, x='Month', y='Total Revenue', marker='o', label='Total Revenue')
sns.lineplot(data=monthly_metrics, x='Month', y='Total Units Sold', marker='o', label='Total Units Sold')
plt.title('Monthly Sales Metrics')
plt.xlabel('Month-Year')
plt.ylabel('Value')
plt.legend()
plt.show()
Out[15]:
<Figure size 1200x600 with 0 Axes>
Out[15]:
<Axes: xlabel='Month', ylabel='Total Revenue'>
Out[15]:
<Axes: xlabel='Month', ylabel='Total Revenue'>
Out[15]:
Text(0.5, 1.0, 'Monthly Sales Metrics')
Out[15]:
Text(0.5, 0, 'Month-Year')
Out[15]:
Text(0, 0.5, 'Value')
Out[15]:
<matplotlib.legend.Legend at 0x2c6f5dee750>
InĀ [16]:
# Calculate sales by city
city_sales = data.groupby('CITY').agg({
'Revenue': 'sum'
}).rename(columns={'Revenue': 'Total Revenue'}).reset_index()
# Sort and identify top 5 cities
top_cities = city_sales.sort_values(by='Total Revenue', ascending=False).head(5)
print(top_cities)
# Plot top 5 cities
plt.figure(figsize=(10, 5))
sns.barplot(data=top_cities, x='CITY', y='Total Revenue')
plt.title('Top 5 Cities by Sales')
plt.xlabel('CITY')
plt.ylabel('Total Revenue')
plt.xticks(rotation=45)
plt.show()
CITY Total Revenue 34 Madrid 902094.19 61 San Rafael 530587.19 42 NYC 450418.63 63 Singapore 227985.50 52 Paris 212959.82
Out[16]:
<Figure size 1000x500 with 0 Axes>
Out[16]:
<Axes: xlabel='CITY', ylabel='Total Revenue'>
Out[16]:
Text(0.5, 1.0, 'Top 5 Cities by Sales')
Out[16]:
Text(0.5, 0, 'CITY')
Out[16]:
Text(0, 0.5, 'Total Revenue')
Out[16]:
([0, 1, 2, 3, 4], [Text(0, 0, 'Madrid'), Text(1, 0, 'San Rafael'), Text(2, 0, 'NYC'), Text(3, 0, 'Singapore'), Text(4, 0, 'Paris')])