import pandas as pd
import numpy as np 
import matplotlib as plt


# 1. Load the CSV dataset and display the first 10 rows
data = pd.read_csv(r"C:\Users\USER\OneDrive\Desktop\project\sales_data.csv")
print("First 10 rows of the dataset:")
print(data.head(10))


# 2. Check for missing values and handle them appropriately
print("\nMissing values per column before cleaning:")
print(data.isnull().sum())


# Drop rows with missing values
data.dropna(inplace=True)


# Drop duplicate rows
data.drop_duplicates(inplace=True)


print("\nMissing values after cleaning:")
print(data.isnull().sum())


# 3. Calculate total amount spent by each customer
customer_total_spent = data.groupby('Customer_ID')['Total_Amount'].sum().reset_index()
print("\nTotal amount spent by each customer:")
print(customer_total_spent.head())


# 4. Filter transactions where payment method is 'Credit Card'
credit_card_transactions = data[data['Payment_Method'] == 'Credit Card']
print("\nCredit Card transactions:")
print(credit_card_transactions.head())


# 5. Create a new column for customer age groups
bins = [0, 17, 25, 35, 45, 60, 100]
labels = ['<18', '18-25', '26-35', '36-45', '46-60', '60+']
data['Age_Group'] = pd.cut(data['Age'], bins=bins, labels=labels, right=True)


print("\nAge groups added:")
print(data[['Age', 'Age_Group']].head())


# 6. Calculate total purchases per product category
total_purchases_per_product_category = data.groupby("Product_Category")['Total_Purchases'].sum()
print("\nTotal purchases per product category:")
print(total_purchases_per_product_category)


# 7. Analyze gender-wise distribution of customers
data['Gender'] = data['Gender'].astype(str).str.strip().str.capitalize()
gender_distribution = data['Gender'].value_counts()
print("\nGender-wise distribution of customers:")
print(gender_distribution)


# 8. Calculate correlation between Total_Purchases and amount spent
correlation = data['Total_Purchases'].corr(data['Total_Amount'])
print("\nCorrelation between Total_Purchases and Total_Amount Spent:", round(correlation, 3))


# 9. Identify top 10 customers based on amount spent
customer_spending = data.groupby('Customer_ID')['Total_Amount'].sum()
top_10_customers = customer_spending.sort_values(ascending=False).head(10)
print("\nTop 10 customers by amount spent:")
print(top_10_customers)


# 10. Group data by shipping method and calculate average Total_Purchases
avg_total_purchases = data.groupby('Shipping_Method')['Total_Purchases'].mean()
print("\nAverage Total_Purchases by shipping method:")
print(avg_total_purchases)


# 11. Find the most popular product brand by sales volume
popular_brand = data.groupby('Product_Brand')['Total_Purchases'].max()
print("\nMost popular product brand by sales volume:")
print(popular_brand)


# 12. Create a pivot table showing purchases by customer segment and product type
pivot_table = pd.pivot_table(
    data,
    values='Total_Amount',
    index='Customer_Segment',
    columns='Product_Type',
    aggfunc='sum',
    fill_value=0
)
print("\nPivot table: Total_Amount by Customer Segment and Product Type:")
print(pivot_table)


# 14. Detect outliers in amount spent using IQR
Q1 = data['Total_Amount'].quantile(0.25)
Q3 = data['Total_Amount'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR


outliers = data[(data['Total_Amount'] < lower_bound) | (data['Total_Amount'] > upper_bound)]
print("\nDetected outliers in Total_Amount:")
print(outliers)


# 15. Generate summary statistics for all numerical columns
print("\nSummary statistics:")
print(data.describe())


# 16. Export cleaned and transformed dataset to a new CSV file
data.to_csv('cleaned_dataset.csv', index=False)
print("\nCleaned dataset saved as 'cleaned_dataset.csv'")

🧾 About This Code

This Python script performs end-to-end data preprocessing and exploratory data analysis (EDA) on a sales dataset using pandas, with support from NumPy and Matplotlib (though Matplotlib is imported but not used).

📋 Major Steps Covered

Loading the dataset
Handling missing and duplicate data
Feature engineering
Grouping and summarizing
Outlier detection
Exporting the cleaned data

🧠 Key `pandas` Concepts Demonstrated

pd.read_csv() – Load your dataset from a CSV file.
data.dropna() & drop_duplicates() – Remove null and duplicate records.
groupby() – Aggregate data by customer or category.
data[data['Column'] == 'Value'] – Filter rows based on condition.
pd.cut() – Create bins (age groups) from continuous data.
value_counts() – Show distribution counts (e.g., gender-wise).
corr() – Calculate correlation between two numeric columns.
pivot_table() – Generate summary tables based on multiple categories.
IQR-based Outlier Detection – Use statistical range to detect outliers.
data.describe() – Show statistical summary of numeric fields.
data.to_csv() – Export cleaned data to a new CSV file.

🛠️ Noteworthy Techniques Used

Missing value handling: dropna() quickly removes null records.
String transformation: str.strip().str.capitalize() ensures consistent values like gender.
Customer segmentation: pd.cut() creates age-based marketing groups.
Outlier detection: Uses the IQR method for robust anomaly detection.
Pivot tables: Provide business insights by summarizing sales across segments and products.

📌 Suggested Enhancements

Idea	Benefit
Use Matplotlib or Seaborn	Provides visual insights into patterns or distributions
Visualize outliers with boxplots	Helps quickly identify anomalies
Encode categorical features	Makes data ML-ready
Split code into functions	Improves modularity and reuse
Add `try-except` blocks	Handles file errors gracefully

🐼 Why Use Pandas?

Efficient & fast: Excellent for large tabular data.
Readable: Code closely resembles natural language.
Flexible: Offers merging, reshaping, time-series support.
Integrations: Works seamlessly with NumPy, Matplotlib, and Scikit-learn.