실전 프로젝트 — 이커머스 데이터 분석 대시보드

프로젝트 전체 구조

1. 데이터 생성 및 로드

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, r2_score
from sklearn.pipeline import Pipeline

import platform

# Fix the NumPy RNG seed so the synthetic data generated below is reproducible.
np.random.seed(42)

# Choose a Korean-capable font for the current OS.  "AppleGothic" ships only
# with macOS, so hard-coding it leaves Hangul labels unrendered elsewhere.
# NOTE(review): the fallback assumes the Nanum fonts are installed on
# Linux/other systems — confirm for the target environment.
_KOREAN_FONTS = {"Darwin": "AppleGothic", "Windows": "Malgun Gothic"}
plt.rcParams["font.family"] = _KOREAN_FONTS.get(platform.system(), "NanumGothic")
# Draw minus signs with an ASCII hyphen instead of the Unicode minus glyph,
# which the selected font may not provide.
plt.rcParams["axes.unicode_minus"] = False

# Generate sample e-commerce data.
# NOTE: the np.random calls below run in the order the dictionary entries are
# written; reordering columns changes the values produced for a fixed seed.
n_orders = 5000
n_customers = 1000

# Customer dimension table: one row per customer_id (1..n_customers).
customers = pd.DataFrame({
    "customer_id": range(1, n_customers + 1),
    # One sign-up timestamp every 8 hours starting 2023-01-01.
    "가입일": pd.date_range("2023-01-01", periods=n_customers, freq="8h"),
    # Region drawn with the given (non-uniform) probabilities.
    "지역": np.random.choice(["서울", "경기", "부산", "대구", "광주"], n_customers,
                              p=[0.35, 0.30, 0.15, 0.10, 0.10]),
    # Integer ages 20..64 (the upper bound of randint is exclusive).
    "나이": np.random.randint(20, 65, n_customers),
    "성별": np.random.choice(["남", "여"], n_customers),
})

# Order fact table: one row per order_id (1..n_orders).
orders = pd.DataFrame({
    "order_id": range(1, n_orders + 1),
    # Customers are sampled with replacement, so order counts vary per
    # customer and some customers may receive no orders at all.
    "customer_id": np.random.randint(1, n_customers + 1, n_orders),
    # Order date: 2024-01-01 plus a random offset of 0..364 days.
    "주문일": pd.to_datetime("2024-01-01") + pd.to_timedelta(
        np.random.randint(0, 365, n_orders), unit="D"
    ),
    "카테고리": np.random.choice(["전자제품", "의류", "식품", "도서", "스포츠"], n_orders,
                                  p=[0.25, 0.30, 0.20, 0.10, 0.15]),
    # Log-normally distributed amount (mean=10, sigma=1 in log space),
    # truncated to an integer.
    "금액": np.random.lognormal(10, 1, n_orders).astype(int),
    # Quantity 1..5.
    "수량": np.random.randint(1, 6, n_orders),
    "할인율": np.random.choice([0, 0.05, 0.10, 0.20, 0.30], n_orders,
                               p=[0.40, 0.25, 0.20, 0.10, 0.05]),
    # Return flag: 1 with probability 0.08, otherwise 0.
    "반품여부": np.random.choice([0, 1], n_orders, p=[0.92, 0.08]),
})

# Amount actually paid after applying the discount rate, truncated to int.
orders["실결제금액"] = (orders["금액"] * (1 - orders["할인율"])).astype(int)

# Merge the data: a left join keeps every order row and attaches the matching
# customer attributes.
df = orders.merge(customers, on="customer_id", how="left")
print(f"데이터 크기: {df.shape}")
print(df.head())

2. 데이터 정제

def clean_ecommerce_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the merged order/customer table.

    The input frame is left untouched.  The function

    * parses ``주문일`` and ``가입일`` as datetimes,
    * fills missing ``나이`` with the column median and missing ``지역`` with
      ``"기타"``,
    * derives ``주문_월``, ``주문_요일``, ``주문_분기`` and ``고객_나이대``, and
    * clips ``실결제금액`` to its 1st-99th percentile range.

    Args:
        df: Frame containing at least the columns ``주문일``, ``가입일``,
            ``나이``, ``지역`` and ``실결제금액``.

    Returns:
        A new DataFrame with the cleaned and derived columns.
    """
    df = df.copy()

    # Make sure the date columns are real datetimes.
    df["주문일"] = pd.to_datetime(df["주문일"])
    df["가입일"] = pd.to_datetime(df["가입일"])

    # Fill missing values first so the derived age band below is never NaN
    # for a row whose age was imputed.  Plain assignment is used instead of
    # ``df[col].fillna(..., inplace=True)``: that form is chained assignment,
    # which is deprecated and has no effect under pandas Copy-on-Write.
    df["나이"] = df["나이"].fillna(df["나이"].median())
    df["지역"] = df["지역"].fillna("기타")

    # Derived variables.
    df["주문_월"] = df["주문일"].dt.month
    df["주문_요일"] = df["주문일"].dt.day_name()
    df["주문_분기"] = df["주문일"].dt.quarter
    # Left-closed bins ([30, 40) -> "30대") so that decade boundaries fall
    # into the band their label names; the default right-closed bins put
    # age 30 into "20대".  The open upper bound keeps every age >= 50 in the
    # last band.
    df["고객_나이대"] = pd.cut(
        df["나이"],
        bins=[0, 30, 40, 50, float("inf")],
        right=False,
        labels=["20대", "30대", "40대", "50대이상"],
    )

    # Clip outliers to the 1st and 99th percentiles.
    lower, upper = df["실결제금액"].quantile([0.01, 0.99])
    df["실결제금액"] = df["실결제금액"].clip(lower, upper)

    return df

# Apply the cleaning step, then report how many nulls remain in each column.
df = clean_ecommerce_data(df)
print("정제 완료")
remaining_nulls = df.isnull().sum()
print(remaining_nulls)

3. EDA: 매출 분석

# Lay out a 2x3 grid of charts and give each axis a descriptive name.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
(ax_monthly, ax_category, ax_region), (ax_amount, ax_weekday, ax_discount) = axes
fig.suptitle("이커머스 매출 분석 대시보드", fontsize=16, fontweight="bold")

# Monthly revenue trend, in millions of won.
monthly = df.groupby("주문_월")["실결제금액"].sum() / 1e6
ax_monthly.plot(monthly.index, monthly.values,
                marker="o", linewidth=2, color="steelblue")
ax_monthly.fill_between(monthly.index, monthly.values, alpha=0.3)
ax_monthly.set_title("월별 매출 (백만원)")
ax_monthly.set_xlabel("월")

# Total revenue per category, smallest first so the largest bar ends up on top.
cat_sales = df.groupby("카테고리")["실결제금액"].sum().sort_values(ascending=True)
category_colors = sns.color_palette("husl", len(cat_sales))
ax_category.barh(cat_sales.index, cat_sales.values / 1e6, color=category_colors)
ax_category.set_title("카테고리별 총 매출 (백만원)")

# Share of orders per region.
region_count = df["지역"].value_counts()
ax_region.pie(region_count.values, labels=region_count.index,
              autopct="%1.1f%%", startangle=90)
ax_region.set_title("지역별 주문 비중")

# Distribution of paid amounts with the mean marked by a dashed line.
df["실결제금액"].hist(bins=50, ax=ax_amount, color="steelblue", edgecolor="white")
mean_paid = df["실결제금액"].mean()
ax_amount.axvline(mean_paid, color="red", linestyle="--",
                  label=f"평균: {mean_paid:,.0f}원")
ax_amount.set_title("결제금액 분포")
ax_amount.legend()

# Orders per weekday; reindex puts the English day names into Mon..Sun order
# so they line up with the Korean tick labels.
dow_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
dow_labels = ["월", "화", "수", "목", "금", "토", "일"]
dow_count = df.groupby("주문_요일")["order_id"].count().reindex(dow_order)
ax_weekday.bar(dow_labels, dow_count.values, color="coral")
ax_weekday.set_title("요일별 주문 수")

# Average paid amount per discount rate, labelled as whole percentages.
discount_sales = df.groupby("할인율")["실결제금액"].mean()
discount_labels = [f"{int(r*100)}%" for r in discount_sales.index]
ax_discount.bar(discount_labels, discount_sales.values, color="mediumpurple")
ax_discount.set_title("할인율별 평균 결제금액")

plt.tight_layout()
plt.savefig("ecommerce_dashboard.png", dpi=150, bbox_inches="tight")

4. 고객 세그먼트 (RFM 분석)

from datetime import datetime

# Reference date: the day after the latest order, so every Recency is >= 1.
snapshot_date = df["주문일"].max() + pd.Timedelta(days=1)

# One row per customer: days since last order, order count, total paid.
rfm = df.groupby("customer_id").agg(
    Recency=("주문일", lambda x: (snapshot_date - x.max()).days),
    Frequency=("order_id", "count"),
    Monetary=("실결제금액", "sum"),
).reset_index()


def _quintile_score(values, labels):
    """Bin *values* into five quantile groups labelled with *labels*.

    ``pd.qcut`` raises ``ValueError`` when quantile edges coincide, which is
    easy to hit with small integer counts such as Frequency.  In that case
    the values are ranked first (ties broken by row order) so the edges are
    distinct.  When plain ``qcut`` succeeds its result is returned unchanged.
    """
    try:
        return pd.qcut(values, 5, labels=labels)
    except ValueError:
        return pd.qcut(values.rank(method="first"), 5, labels=labels)


# Score each metric 1-5.  Recency is reversed: a smaller value (a more recent
# purchase) earns the higher score.
_score_labels = {
    "Recency": [5, 4, 3, 2, 1],
    "Frequency": [1, 2, 3, 4, 5],
    "Monetary": [1, 2, 3, 4, 5],
}
for col, labels in _score_labels.items():
    rfm[f"{col}_Score"] = _quintile_score(rfm[col], labels)

# Combined score in the range 3..15.
rfm["RFM_Score"] = (rfm["Recency_Score"].astype(int)
                    + rfm["Frequency_Score"].astype(int)
                    + rfm["Monetary_Score"].astype(int))

# 세그먼트 분류
def classify_segment(score):
    """Map a combined RFM score to a customer segment label.

    Thresholds are tested from highest to lowest: 13 and above is
    "Champions", 10-12 is "Loyal", 7-9 is "At Risk" and anything lower is
    "Lost".
    """
    tiers = ((13, "Champions"), (10, "Loyal"), (7, "At Risk"))
    for minimum, label in tiers:
        if score >= minimum:
            return label
    return "Lost"

# Label every customer with the segment implied by the combined RFM score.
rfm["Segment"] = rfm["RFM_Score"].map(classify_segment)

# Segment sizes, largest first.
segment_counts = rfm["Segment"].value_counts()
print(segment_counts)

# Fraction of customers that landed in the top segment.
champion_share = (rfm["Segment"] == "Champions").mean()
print(f"\n고가치 고객(Champions) 비율: {champion_share:.1%}")

5. 반품 예측 모델

# 특성 준비
feature_cols = ["금액", "수량", "할인율", "주문_월", "주문_분기", "나이"]

df_ml = df[feature_cols + ["반품여부"]].dropna()
X = df_ml[feature_cols]
y = df_ml["반품여부"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = Pipeline([
    ("scaler", StandardScaler()),
    ("model", RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")),
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("반품 예측 모델 성능:")
print(classification_report(y_test, y_pred, target_names=["정상", "반품"]))

# 특성 중요도
importances = clf.named_steps["model"].feature_importances_
feat_imp = pd.Series(importances, index=feature_cols).sort_values(ascending=False)
print("\n특성 중요도:")
print(feat_imp)

6. 월별 매출 예측

# 월별 집계
monthly_data = df.groupby(["주문_월", "주문_분기"]).agg(
    총매출=("실결제금액", "sum"),
    주문수=("order_id", "count"),
    평균할인율=("할인율", "mean"),
    고유고객수=("customer_id", "nunique"),
).reset_index()

X_reg = monthly_data[["주문_월", "주문_분기", "주문수", "평균할인율", "고유고객수"]]
y_reg = monthly_data["총매출"]

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.25, random_state=42
)

reg = GradientBoostingRegressor(n_estimators=100, random_state=42)
reg.fit(X_train_r, y_train_r)

y_pred_r = reg.predict(X_test_r)
r2 = r2_score(y_test_r, y_pred_r)
print(f"매출 예측 R²: {r2:.3f}")

7. 자동 인사이트 리포트

def generate_summary(df: pd.DataFrame, rfm: pd.DataFrame) -> dict:
    """Collect headline metrics for the analysis as display-ready values.

    Args:
        df: Order-level frame with the columns ``실결제금액``, ``주문_월``,
            ``카테고리`` and ``반품여부``.
        rfm: Customer-level frame with a ``Segment`` column.

    Returns:
        A dict with total revenue, best month, best category, return rate
        and Champions share as formatted strings, plus the customer and
        order counts as integers.
    """
    paid = df["실결제금액"]
    revenue_by_month = paid.groupby(df["주문_월"]).sum()
    revenue_by_category = paid.groupby(df["카테고리"]).sum()
    is_champion = rfm["Segment"] == "Champions"

    # Keys are inserted in the order the report prints them.
    summary = {}
    summary["총_매출"] = f"{paid.sum():,.0f}원"
    summary["최고_매출_월"] = f"{revenue_by_month.idxmax()}월"
    summary["최다_매출_카테고리"] = revenue_by_category.idxmax()
    summary["반품률"] = f"{df['반품여부'].mean():.1%}"
    summary["우수_고객_비율"] = f"{is_champion.mean():.1%}"
    summary["총_고객수"] = len(rfm)
    summary["총_주문수"] = len(df)
    return summary

# Build the headline metrics and print them as an indented "name: value" list.
summary = generate_summary(df, rfm)
print("=== 이커머스 분석 요약 ===")
for metric_name, metric_value in summary.items():
    print(f"  {metric_name}: {metric_value}")

정리: 프로젝트 체크리스트

단계	완료 여부	핵심 포인트
데이터 로드 & 정제	✅	타입 변환, 결측값, 이상값
EDA & 시각화	✅	트렌드, 분포, 카테고리 비교
RFM 고객 세그먼트	✅	Recency·Frequency·Monetary
반품 예측 모델	✅	분류, 불균형 클래스 처리
매출 예측	✅	회귀, R² 평가
자동 인사이트 리포트	✅	핵심 지표 요약

Python 데이터 분석 시리즈를 완주했습니다. NumPy 기초부터 AI 연계 분석까지, 데이터 분석의 전체 흐름을 익혔습니다.

프로젝트 전체 구조

1. 데이터 생성 및 로드

2. 데이터 정제

3. EDA: 매출 분석

4. 고객 세그먼트 (RFM 분석)

5. 반품 예측 모델

6. 월별 매출 예측

7. 자동 인사이트 리포트

정리: 프로젝트 체크리스트

궁금한 점이 있으신가요?