데이터 시각화 — Matplotlib과 Seaborn | 파란여우

환경 설정

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Apply the seaborn theme first. sns.set_theme() rewrites matplotlib's
# rcParams, including font.family (reset to its `font` argument, which
# defaults to "sans-serif"), so a font override made before this call is
# silently discarded and Korean labels fail to render.
sns.set_theme(style="whitegrid", palette="husl")

# Korean font (macOS). Set after the theme so it is not overwritten.
plt.rcParams["font.family"] = "AppleGothic"
# Use an ASCII hyphen for negative tick labels; CJK fonts often lack the
# Unicode minus glyph.
plt.rcParams["axes.unicode_minus"] = False

기본 차트 (Matplotlib)

# 2x2 grid of axes used by the four basic Matplotlib charts in this section.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

# Line chart (top-left): one sales value per month, months 1 through 12.
months = range(1, 13)
sales = [120, 135, 148, 162, 175, 190, 185, 178, 165, 155, 145, 160]
line_ax = axes[0, 0]
line_ax.plot(months, sales, marker="o", color="blue", linewidth=2)
line_ax.set(title="월별 매출", xlabel="월", ylabel="매출 (만원)")

# Bar chart (top-right): head count per team, one "husl" colour per bar.
categories = ["개발팀", "마케팅팀", "디자인팀", "영업팀"]
values = [45, 32, 28, 38]
bar_ax = axes[0, 1]
bars = bar_ax.bar(categories, values, color=sns.color_palette("husl", 4))
bar_ax.set_title("팀별 인원")
# Write each value centred horizontally, slightly above the top of its bar.
for rect, count in zip(bars, values):
    label_x = rect.get_x() + rect.get_width()/2
    label_y = rect.get_height() + 0.5
    bar_ax.text(label_x, label_y, str(count), ha="center", va="bottom")

# Pie chart (bottom-left): share per team; the first wedge is offset slightly
# from the centre via `explode`.
sizes = [35, 25, 20, 20]
explode = (0.05, 0, 0, 0)
pie_ax = axes[1, 0]
pie_ax.pie(
    sizes,
    labels=categories,
    explode=explode,
    autopct="%1.1f%%",
    startangle=90,
)
pie_ax.set_title("팀별 비율")

# Histogram (bottom-right): 1000 samples drawn from a normal distribution
# with mean 170 and standard deviation 10.
data = np.random.normal(170, 10, 1000)
sample_mean = data.mean()
hist_ax = axes[1, 1]
hist_ax.hist(data, bins=30, edgecolor="white", color="steelblue")
# Dashed red vertical line at the sample mean, labelled for the legend.
hist_ax.axvline(sample_mean, color="red", linestyle="--",
                label=f"평균: {sample_mean:.1f}")
hist_ax.set_title("키 분포")
hist_ax.legend()

# Tighten the subplot spacing, write the figure to disk, then display it.
fig.tight_layout()
plt.savefig("charts.png", dpi=150, bbox_inches="tight")
plt.show()

산점도와 상관관계

# Reproducible sample data: ad spend, sales and region for `n` records.
# The random draws happen in a fixed order (ad spend, initial sales, region,
# noise) so the seeded output stays the same.
np.random.seed(42)
n = 200
region_names = ["서울", "부산", "대구", "광주"]
ad_spend = np.random.uniform(100, 1000, n)
initial_sales = np.random.uniform(100, 1000, n)
regions = np.random.choice(region_names, n)
df = pd.DataFrame({"광고비": ad_spend, "매출": initial_sales, "지역": regions})

# Replace the uncorrelated sales column with one that depends linearly on
# ad spend (slope 0.8) plus Gaussian noise with standard deviation 100.
noise = np.random.normal(0, 100, n)
df["매출"] = df["광고비"] * 0.8 + noise

# Seaborn scatter plot of ad spend vs. sales: colour encodes region, marker
# size encodes sales. A red regression line is overlaid on the same axes.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x="광고비",
    y="매출",
    hue="지역",
    size="매출",
    sizes=(20, 200),
    alpha=0.7,
    ax=scatter_ax,
)
# scatter=False draws only the fitted line, not a second set of points.
sns.regplot(
    data=df,
    x="광고비",
    y="매출",
    scatter=False,
    color="red",
    label="추세선",
    ax=scatter_ax,
)
scatter_ax.set_title("광고비 vs 매출 (지역별)")
scatter_ax.legend()
plt.show()

분포 시각화

# Three side-by-side views of the sales distribution in `df`.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Box plot: sales per region.
sns.boxplot(data=df, x="지역", y="매출", ax=axes[0])
axes[0].set_title("지역별 매출 분포 (박스플롯)")

# Violin plot: sales per region.
sns.violinplot(data=df, x="지역", y="매출", ax=axes[1])
axes[1].set_title("지역별 매출 분포 (바이올린)")

# Histogram with a KDE curve over all sales values.
sns.histplot(data=df, x="매출", kde=True, ax=axes[2], color="steelblue")
axes[2].set_title("매출 전체 분포")

plt.tight_layout()
# Display the figure, as the earlier sections do; without this call the
# figure is never shown when the code runs as a plain script.
plt.show()

히트맵: 상관관계 행렬

# Synthetic numeric data for the correlation matrix. The random draws are
# made in a fixed order: temperature, rain flag, then one noise vector per
# sales column.
sample_count = 100
temperature = np.random.normal(25, 5, sample_count)
rain = np.random.choice([0, 1], sample_count, p=[0.7, 0.3])
# Ice-cream sales rise with temperature; hot-chocolate sales fall with it.
ice_cream_sales = temperature * 10 + np.random.normal(0, 20, sample_count)
hot_choc_sales = -temperature * 8 + np.random.normal(200, 30, sample_count)
# Umbrella sales depend on the rain flag only.
umbrella_sales = rain * 100 + np.random.normal(10, 20, sample_count)

# Column order matters: it sets the row/column order of the heatmap.
corr_df = pd.DataFrame({
    "온도": temperature,
    "아이스크림 판매": ice_cream_sales,
    "핫초코 판매": hot_choc_sales,
    "우산 판매": umbrella_sales,
    "비": rain,
})

corr_matrix = corr_df.corr()

# Annotated heatmap of the correlation matrix. The colour scale is fixed to
# [-1, 1] and centred on 0 so positive and negative correlations are
# symmetric around the midpoint colour.
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="RdYlGn",
            vmin=-1, vmax=1, center=0,
            square=True, linewidths=0.5)
plt.title("변수 간 상관관계")
# Display the figure, as the earlier sections do; without this call the
# figure is never shown when the code runs as a plain script.
plt.show()

시계열 시각화

# 90 consecutive daily timestamps starting 2026-01-01.
dates = pd.date_range("2026-01-01", periods=90, freq="D")
# Daily visitors: Poisson counts (mean 1000) plus a sine wave with a 7-day
# period and amplitude 200.
base_visitors = np.random.poisson(1000, 90)
weekly_cycle = np.sin(np.arange(90) * 2*np.pi/7) * 200
df_time = pd.DataFrame({"날짜": dates, "방문자": base_visitors + weekly_cycle})

# Daily visitor counts with a 7-day moving average overlaid.
plt.figure(figsize=(14, 5))
plt.plot(df_time["날짜"], df_time["방문자"], alpha=0.7)
# Moving average over a 7-row window; the first six entries are NaN because
# a full window is not yet available, so the red line starts on day 7.
rolling_avg = df_time["방문자"].rolling(7).mean()
plt.plot(df_time["날짜"], rolling_avg, color="red", linewidth=2, label="7일 이동평균")
plt.title("일별 방문자 수")
plt.legend()
# Rotate the date tick labels for readability.
plt.xticks(rotation=45)
# Display the figure, as the earlier sections do; without this call the
# figure is never shown when the code runs as a plain script.
plt.show()

정리

차트	적합한 상황
선 그래프	시간에 따른 변화
막대 그래프	범주별 비교
히스토그램	분포 확인
산점도	두 변수 관계
박스플롯	분포와 이상값
히트맵	상관관계 행렬

다음 편에서는 탐색적 데이터 분석(EDA) — 데이터에서 인사이트를 발굴하는 체계적인 방법을 배웁니다.

환경 설정

기본 차트 (Matplotlib)

산점도와 상관관계

분포 시각화

히트맵: 상관관계 행렬

시계열 시각화

정리

궁금한 점이 있으신가요?