import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Read the dataset
data = pd.read_csv('sales_data_sample.csv', encoding='ISO-8859-1') 

print('Data before processing')
print(data.info())

# 1. Convert the ORDERDATE column to a proper datetime format
data['ORDERDATE'] = pd.to_datetime(data['ORDERDATE'], errors='coerce')

# 2. Handle missing values:

# Fill missing values for 'ADDRESSLINE2' with empty string
data['ADDRESSLINE2'].fillna('', inplace=True)

# Fill missing values for 'STATE', 'POSTALCODE', and 'TERRITORY' with 'Unknown'
data['STATE'].fillna('Unknown', inplace=True)
data['POSTALCODE'].fillna('Unknown', inplace=True)
data['TERRITORY'].fillna('Unknown', inplace=True)


print('Data after processing')
data.info()

Data before processing
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2823 entries, 0 to 2822
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ORDERNUMBER       2823 non-null   int64  
 1   QUANTITYORDERED   2823 non-null   int64  
 2   PRICEEACH         2823 non-null   float64
 3   ORDERLINENUMBER   2823 non-null   int64  
 4   SALES             2823 non-null   float64
 5   ORDERDATE         2823 non-null   object 
 6   STATUS            2823 non-null   object 
 7   QTR_ID            2823 non-null   int64  
 8   MONTH_ID          2823 non-null   int64  
 9   YEAR_ID           2823 non-null   int64  
 10  PRODUCTLINE       2823 non-null   object 
 11  MSRP              2823 non-null   int64  
 12  PRODUCTCODE       2823 non-null   object 
 13  CUSTOMERNAME      2823 non-null   object 
 14  PHONE             2823 non-null   object 
 15  ADDRESSLINE1      2823 non-null   object 
 16  ADDRESSLINE2      302 non-null    object 
 17  CITY              2823 non-null   object 
 18  STATE             1337 non-null   object 
 19  POSTALCODE        2747 non-null   object 
 20  COUNTRY           2823 non-null   object 
 21  TERRITORY         1749 non-null   object 
 22  CONTACTLASTNAME   2823 non-null   object 
 23  CONTACTFIRSTNAME  2823 non-null   object 
 24  DEALSIZE          2823 non-null   object 
dtypes: float64(2), int64(7), object(16)
memory usage: 551.5+ KB
None
Data after processing
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2823 entries, 0 to 2822
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   ORDERNUMBER       2823 non-null   int64         
 1   QUANTITYORDERED   2823 non-null   int64         
 2   PRICEEACH         2823 non-null   float64       
 3   ORDERLINENUMBER   2823 non-null   int64         
 4   SALES             2823 non-null   float64       
 5   ORDERDATE         2823 non-null   datetime64[ns]
 6   STATUS            2823 non-null   object        
 7   QTR_ID            2823 non-null   int64         
 8   MONTH_ID          2823 non-null   int64         
 9   YEAR_ID           2823 non-null   int64         
 10  PRODUCTLINE       2823 non-null   object        
 11  MSRP              2823 non-null   int64         
 12  PRODUCTCODE       2823 non-null   object        
 13  CUSTOMERNAME      2823 non-null   object        
 14  PHONE             2823 non-null   object        
 15  ADDRESSLINE1      2823 non-null   object        
 16  ADDRESSLINE2      2823 non-null   object        
 17  CITY              2823 non-null   object        
 18  STATE             2823 non-null   object        
 19  POSTALCODE        2823 non-null   object        
 20  COUNTRY           2823 non-null   object        
 21  TERRITORY         2823 non-null   object        
 22  CONTACTLASTNAME   2823 non-null   object        
 23  CONTACTFIRSTNAME  2823 non-null   object        
 24  DEALSIZE          2823 non-null   object        
dtypes: datetime64[ns](1), float64(2), int64(7), object(15)
memory usage: 551.5+ KB

# Calculate total revenue for each product
data['Revenue'] = data['QUANTITYORDERED'] * data['PRICEEACH']
product_metrics = data.groupby('PRODUCTCODE').agg({
    'Revenue': 'sum',
    'QUANTITYORDERED': 'sum',
    'PRICEEACH': 'mean',
    'ORDERNUMBER': 'count'  # Total number of orders
}).rename(columns={
    'Revenue': 'Total Revenue',
    'QUANTITYORDERED': 'Total Units Sold',
    'PRICEEACH': 'Average Price per Unit',
    'ORDERNUMBER': 'Total Orders'
}).reset_index()

# Rank products based on these metrics
product_metrics = product_metrics.sort_values(by='Total Revenue', ascending=False)
print(product_metrics)

   PRODUCTCODE  Total Revenue  Total Units Sold  Average Price per Unit  \
39    S18_3232      176026.63              1774               99.045000   
76    S24_3856      103489.89              1052               98.068519   
50    S18_4600      101835.00              1031               98.374444   
63    S24_2300       99600.00               996              100.000000   
25    S18_2238       96300.00               966               99.555556   
..         ...            ...               ...                     ...   
62    S24_2022       42524.98               851               50.862400   
69    S24_2972       42145.94               912               45.498519   
85    S32_2206       39237.71               836               47.084000   
59    S24_1937       37392.38               844               43.992000   
78    S24_3969       33181.66               745               45.046818   

    Total Orders  
39            52  
76            27  
50            27  
63            27  
25            27  
..           ...  
62            25  
69            27  
85            25  
59            25  
78            22  

[109 rows x 5 columns]

# Convert 'ORDERDATE' to datetime
data['ORDERDATE'] = pd.to_datetime(data['ORDERDATE'])
# Extract the month
data['Month'] = data['ORDERDATE'].dt.month.astype(str).str.zfill(2)
data = data.sort_values(by='Month')
monthly_metrics = data.groupby('Month').agg({
    'Revenue': 'sum',
    'QUANTITYORDERED': 'sum',
    'PRICEEACH': 'mean'
}).rename(columns={
    'Revenue': 'Total Revenue',
    'QUANTITYORDERED': 'Total Units Sold',
    'PRICEEACH': 'Average Price per Unit'
}).reset_index()


# Plot metrics over time
plt.figure(figsize=(12, 6))
sns.lineplot(data=monthly_metrics, x='Month', y='Total Revenue', marker='o', label='Total Revenue')
sns.lineplot(data=monthly_metrics, x='Month', y='Total Units Sold', marker='o', label='Total Units Sold')
plt.title('Monthly Sales Metrics')
plt.xlabel('Month-Year')
plt.ylabel('Value')
plt.legend()
plt.show()

<Figure size 1200x600 with 0 Axes>

<Axes: xlabel='Month', ylabel='Total Revenue'>

<Axes: xlabel='Month', ylabel='Total Revenue'>

Text(0.5, 1.0, 'Monthly Sales Metrics')

Text(0.5, 0, 'Month-Year')

Text(0, 0.5, 'Value')

<matplotlib.legend.Legend at 0x2c6f5dee750>

# Calculate sales by city
city_sales = data.groupby('CITY').agg({
    'Revenue': 'sum'
}).rename(columns={'Revenue': 'Total Revenue'}).reset_index()

# Sort and identify top 5 cities
top_cities = city_sales.sort_values(by='Total Revenue', ascending=False).head(5)
print(top_cities)

# Plot top 5 cities
plt.figure(figsize=(10, 5))
sns.barplot(data=top_cities, x='CITY', y='Total Revenue')
plt.title('Top 5 Cities by Sales')
plt.xlabel('CITY')
plt.ylabel('Total Revenue')
plt.xticks(rotation=45)
plt.show()

          CITY  Total Revenue
34      Madrid      902094.19
61  San Rafael      530587.19
42         NYC      450418.63
63   Singapore      227985.50
52       Paris      212959.82

<Figure size 1000x500 with 0 Axes>

<Axes: xlabel='CITY', ylabel='Total Revenue'>

Text(0.5, 1.0, 'Top 5 Cities by Sales')

Text(0.5, 0, 'CITY')

Text(0, 0.5, 'Total Revenue')

([0, 1, 2, 3, 4],
 [Text(0, 0, 'Madrid'),
  Text(1, 0, 'San Rafael'),
  Text(2, 0, 'NYC'),
  Text(3, 0, 'Singapore'),
  Text(4, 0, 'Paris')])

Data Analysis Using Python