import pandas as pd
import numpy as np 
import matplotlib as plt


# 1. Load the CSV dataset and display the first 10 rows
data = pd.read_csv(r"C:\Users\USER\OneDrive\Desktop\project\sales_data.csv")
print("First 10 rows of the dataset:")
print(data.head(10))


# 2. Check for missing values and handle them appropriately
print("\nMissing values per column before cleaning:")
print(data.isnull().sum())


# Drop rows with missing values
data.dropna(inplace=True)


# Drop duplicate rows
data.drop_duplicates(inplace=True)


print("\nMissing values after cleaning:")
print(data.isnull().sum())


# 3. Calculate total amount spent by each customer
customer_total_spent = data.groupby('Customer_ID')['Total_Amount'].sum().reset_index()
print("\nTotal amount spent by each customer:")
print(customer_total_spent.head())


# 4. Filter transactions where payment method is 'Credit Card'
credit_card_transactions = data[data['Payment_Method'] == 'Credit Card']
print("\nCredit Card transactions:")
print(credit_card_transactions.head())


# 5. Create a new column for customer age groups
bins = [0, 17, 25, 35, 45, 60, 100]
labels = ['<18', '18-25', '26-35', '36-45', '46-60', '60+']
data['Age_Group'] = pd.cut(data['Age'], bins=bins, labels=labels, right=True)


print("\nAge groups added:")
print(data[['Age', 'Age_Group']].head())


# 6. Calculate total purchases per product category
total_purchases_per_product_category = data.groupby("Product_Category")['Total_Purchases'].sum()
print("\nTotal purchases per product category:")
print(total_purchases_per_product_category)


# 7. Analyze gender-wise distribution of customers
data['Gender'] = data['Gender'].astype(str).str.strip().str.capitalize()
gender_distribution = data['Gender'].value_counts()
print("\nGender-wise distribution of customers:")
print(gender_distribution)


# 8. Calculate correlation between Total_Purchases and amount spent
correlation = data['Total_Purchases'].corr(data['Total_Amount'])
print("\nCorrelation between Total_Purchases and Total_Amount Spent:", round(correlation, 3))


# 9. Identify top 10 customers based on amount spent
customer_spending = data.groupby('Customer_ID')['Total_Amount'].sum()
top_10_customers = customer_spending.sort_values(ascending=False).head(10)
print("\nTop 10 customers by amount spent:")
print(top_10_customers)


# 10. Group data by shipping method and calculate average Total_Purchases
avg_total_purchases = data.groupby('Shipping_Method')['Total_Purchases'].mean()
print("\nAverage Total_Purchases by shipping method:")
print(avg_total_purchases)


# 11. Find the most popular product brand by sales volume
popular_brand = data.groupby('Product_Brand')['Total_Purchases'].max()
print("\nMost popular product brand by sales volume:")
print(popular_brand)


# 12. Create a pivot table showing purchases by customer segment and product type
pivot_table = pd.pivot_table(
    data,
    values='Total_Amount',
    index='Customer_Segment',
    columns='Product_Type',
    aggfunc='sum',
    fill_value=0
)
print("\nPivot table: Total_Amount by Customer Segment and Product Type:")
print(pivot_table)


# 14. Detect outliers in amount spent using IQR
Q1 = data['Total_Amount'].quantile(0.25)
Q3 = data['Total_Amount'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR


outliers = data[(data['Total_Amount'] < lower_bound) | (data['Total_Amount'] > upper_bound)]
print("\nDetected outliers in Total_Amount:")
print(outliers)


# 15. Generate summary statistics for all numerical columns
print("\nSummary statistics:")
print(data.describe())


# 16. Export cleaned and transformed dataset to a new CSV file
data.to_csv('cleaned_dataset.csv', index=False)
print("\nCleaned dataset saved as 'cleaned_dataset.csv'")
      

๐Ÿงพ About This Code

This Python script performs end-to-end data preprocessing and exploratory data analysis (EDA) on a sales dataset using pandas, with support from NumPy and Matplotlib (though Matplotlib is imported but not used).

๐Ÿ“‹ Major Steps Covered

  1. Loading the dataset
  2. Handling missing and duplicate data
  3. Feature engineering
  4. Grouping and summarizing
  5. Outlier detection
  6. Exporting the cleaned data

๐Ÿง  Key pandas Concepts Demonstrated

  1. pd.read_csv() โ€“ Load your dataset from a CSV file.
  2. data.dropna() & drop_duplicates() โ€“ Remove null and duplicate records.
  3. groupby() โ€“ Aggregate data by customer or category.
  4. data[data['Column'] == 'Value'] โ€“ Filter rows based on condition.
  5. pd.cut() โ€“ Create bins (age groups) from continuous data.
  6. value_counts() โ€“ Show distribution counts (e.g., gender-wise).
  7. corr() โ€“ Calculate correlation between two numeric columns.
  8. pivot_table() โ€“ Generate summary tables based on multiple categories.
  9. IQR-based Outlier Detection โ€“ Use statistical range to detect outliers.
  10. data.describe() โ€“ Show statistical summary of numeric fields.
  11. data.to_csv() โ€“ Export cleaned data to a new CSV file.

๐Ÿ› ๏ธ Noteworthy Techniques Used

๐Ÿ“Œ Suggested Enhancements

Idea Benefit
Use Matplotlib or Seaborn Provides visual insights into patterns or distributions
Visualize outliers with boxplots Helps quickly identify anomalies
Encode categorical features Makes data ML-ready
Split code into functions Improves modularity and reuse
Add try-except blocks Handles file errors gracefully

๐Ÿผ Why Use Pandas?

Back to Home