머신러닝 기초 — scikit-learn으로 예측 모델 만들기

머신러닝 워크플로우

분류 문제 (타이타닉 생존 예측)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

# Load the Titanic passenger table. The steps below read the columns
# Name, Sex, Age, SibSp, Parch, Fare, Embarked, Pclass and Survived.
df = pd.read_csv("titanic.csv")

# --- Feature engineering ---
# FamilySize = the passenger plus siblings/spouses (SibSp) and parents/children (Parch).
df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
df["IsAlone"] = (df["FamilySize"] == 1).astype(int)
# The title is the run of letters immediately before a period in the name.
# A raw string keeps "\." a valid regex escape (the non-raw form triggers an
# invalid-escape warning), and expand=False returns a Series, not a DataFrame.
df["Title"] = df["Name"].str.extract(r"([A-Za-z]+)\.", expand=False)
# Collapse the listed titles into a single "Rare" category.
df["Title"] = df["Title"].replace(
    ["Lady", "Countess", "Capt", "Col", "Don",
     "Dr", "Major", "Rev", "Sir", "Jonkheer", "Dona"],
    "Rare",
)

# --- Missing values ---
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form warns on pandas 2.x and does not modify
# `df` once Copy-on-Write is enabled.
# NOTE: the median/mode are computed on the full table, before the train/test split.
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])

# --- Encoding ---
# Sex becomes an integer label; Embarked and Title become dummy columns with
# the first level of each dropped.
df["Sex"] = LabelEncoder().fit_transform(df["Sex"])
df = pd.get_dummies(df, columns=["Embarked", "Title"], drop_first=True)

# --- Feature selection ---
# Fixed base columns plus every dummy column generated above.
features = ["Pclass", "Sex", "Age", "Fare", "FamilySize", "IsAlone"] + [
    c for c in df.columns if c.startswith(("Embarked_", "Title_"))
]
X = df[features]
y = df["Survived"]

# --- Train/test split (80/20, fixed seed for reproducibility) ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

모델 비교

# Candidate classifiers, keyed by the label printed in the comparison.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000)
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)
models["Gradient Boosting"] = GradientBoostingClassifier(random_state=42)

# Mean and standard deviation of 5-fold cross-validated accuracy per model,
# measured on the training split only.
results = {}
for label, estimator in models.items():
    fold_accuracy = cross_val_score(estimator, X_train, y_train, scoring="accuracy", cv=5)
    avg = fold_accuracy.mean()
    spread = fold_accuracy.std()
    results[label] = {"mean": avg, "std": spread}
    print(f"{label}: {avg:.3f} ± {spread:.3f}")

# Train the final model. It is fixed to a fresh random forest here rather
# than being picked from `results`.
best_model = RandomForestClassifier(random_state=42, n_estimators=100)
best_model.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = best_model.predict(X_test)
print("\n분류 보고서:")
report = classification_report(y_test, y_pred, target_names=["사망", "생존"])
print(report)

특성 중요도

import matplotlib.pyplot as plt

# Pair each feature name with the fitted model's importance score and
# order the rows from most to least important.
feature_importance = pd.DataFrame({
    "feature": features,
    "importance": best_model.feature_importances_,
}).sort_values("importance", ascending=False)

# head(10) selects the first ten rows of the sorted frame explicitly,
# instead of integer-slicing each column separately.
top_features = feature_importance.head(10)

plt.figure(figsize=(10, 6))
plt.barh(top_features["feature"], top_features["importance"])
# NOTE(review): the Korean title presumably needs a matplotlib font with
# Hangul glyphs configured; confirm it renders in the target environment.
plt.title("특성 중요도 (상위 10개)")
# barh draws the first row at the bottom; invert so the top feature is on top.
plt.gca().invert_yaxis()
# Keep long feature names inside the figure and actually render the plot;
# without show() a plain script builds the figure but never displays it.
plt.tight_layout()
plt.show()

Pipeline: 전처리 + 모델 통합

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
# OneHotEncoder is used below but was never imported anywhere in the file.
from sklearn.preprocessing import OneHotEncoder

# Columns routed to each branch of the preprocessor.
numeric_features = ["Age", "Fare", "FamilySize"]
categorical_features = ["Embarked"]

# Numeric branch: fill gaps with the median, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# Preprocessing and classifier chained into a single estimator.
clf = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42)),
])

# `df` has already been imputed and dummy-encoded earlier in the file, so
# reload the CSV to obtain untouched columns for the pipeline to preprocess.
# Only FamilySize is derived by hand because it is not a column of the raw data.
raw_df = pd.read_csv("titanic.csv")
raw_df["FamilySize"] = raw_df["SibSp"] + raw_df["Parch"] + 1
X_raw = raw_df[numeric_features + categorical_features]
y_raw = raw_df["Survived"]

# Split features and target together so the rows stay aligned.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_raw, y_raw, test_size=0.2, random_state=42
)

clf.fit(X_train_raw, y_train_raw)
print(f"Pipeline 테스트 정확도: {clf.score(X_test_raw, y_test_raw):.3f}")

회귀 문제 (주택 가격 예측)

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the California housing data as pandas objects (features and target).
housing = fetch_california_housing(as_frame=True)
X, y = housing.data, housing.target

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = GradientBoostingRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6. Taking the square root of the MSE
# gives the same RMSE on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")

정리

개념	설명
교차 검증	과적합 여부 점검, 일반화 성능 추정
Pipeline	전처리-모델 흐름 통합
특성 중요도	어떤 변수가 예측에 중요한가
R²	회귀 모델 설명력 (1에 가까울수록 좋음)

다음 편에서는 AI 연계 분석 — LLM과 데이터 분석을 결합해 자연어로 데이터를 탐색하는 방법을 배웁니다.

머신러닝 워크플로우

분류 문제 (타이타닉 생존 예측)

모델 비교

특성 중요도

Pipeline: 전처리 + 모델 통합

회귀 문제 (주택 가격 예측)

정리

궁금한 점이 있으신가요?