Infrastructure as Code 모범 사례 가이드 2026

IaC · Terraform · Pulumi · AWS CDK · DevOps · Infrastructure · Automation · GitOps

Infrastructure as Code 모범 사례 가이드 2026

서론

Infrastructure as Code(IaC)는 현대 DevOps 환경에서 필수 요소가 되었습니다. 2026년 현재, 클라우드 네이티브 환경의 복잡성이 증가하면서 효율적인 인프라 관리를 위한 체계적인 접근이 더욱 중요해졌습니다.

1. IaC 도구 비교 분석

1.1 Terraform vs Pulumi vs AWS CDK

# Terraform - HCL syntax
resource "aws_instance" "web" {
  ami           = data.aws_ami.ubuntu.id
  instance_type = var.instance_type

  vpc_security_group_ids = [aws_security_group.web.id]
  subnet_id              = aws_subnet.public.id

  # Render the bootstrap script with the database endpoint injected.
  # The AWS provider's RDS instance resource type is `aws_db_instance`;
  # there is no `aws_rds_instance` resource.
  user_data = templatefile("${path.module}/user_data.sh", {
    database_url = aws_db_instance.main.endpoint
  })

  tags = {
    Name        = "web-server"
    Environment = var.environment
  }
}
// Pulumi - TypeScript
import * as aws from "@pulumi/aws";
import * as pulumi from "@pulumi/pulumi";

const webServer = new aws.ec2.Instance("web", {
    ami: ubuntu.then(ami => ami.id),
    instanceType: config.require("instanceType"),
    vpcSecurityGroupIds: [webSecurityGroup.id],
    subnetId: publicSubnet.id,
    userData: pulumi.interpolate`#!/bin/bash
echo "Database URL: ${database.endpoint}" > /etc/database.conf
`,
    tags: {
        Name: "web-server",
        Environment: environment,
    },
});
// AWS CDK v2 - TypeScript
// CDK v1 (`@aws-cdk/*` packages) is end-of-life; v2 ships every construct
// library inside the single `aws-cdk-lib` package.
import * as ec2 from 'aws-cdk-lib/aws-ec2';
import * as rds from 'aws-cdk-lib/aws-rds';

// forLinux() emits the "#!/bin/bash" shebang; without it cloud-init does not
// treat the user data as a shell script and the command never runs.
const userData = ec2.UserData.forLinux();
userData.addCommands(
  `echo "Database URL: ${database.instanceEndpoint.socketAddress}" > /etc/database.conf`
);

const instance = new ec2.Instance(this, 'WebServer', {
  instanceType: ec2.InstanceType.of(
    ec2.InstanceClass.T3,
    ec2.InstanceSize.MICRO
  ),
  // latestAmazonLinux() is deprecated; select the AMI generation explicitly.
  machineImage: ec2.MachineImage.latestAmazonLinux2023(),
  vpc: vpc,
  securityGroup: webSecurityGroup,
  userData,
});

1.2 도구별 특징 비교

| 특징 | Terraform | Pulumi | AWS CDK |
|---|---|---|---|
| 언어 지원 | HCL | Python, TypeScript, Go, C# | TypeScript, Python, Java, C# |
| 상태 관리 | 외부 상태 파일 | 외부 상태 파일 | CloudFormation 스택 |
| 프로바이더 생태계 | 매우 풍부 | 증가 중 | AWS 전용 |
| 학습 곡선 | 중간 | 낮음 (기존 언어 활용) | 낮음 (AWS 경험자) |
| 커뮤니티 | 매우 활발 | 성장 중 | AWS 주도 |

2. 모듈화 및 재사용 전략

2.1 Terraform 모듈 구조

modules/
├── networking/
│   ├── main.tf
│   ├── variables.tf
│   ├── outputs.tf
│   └── versions.tf
├── compute/
│   ├── ec2/
│   ├── ecs/
│   └── lambda/
├── data/
│   ├── rds/
│   ├── dynamodb/
│   └── s3/
└── security/
    ├── iam/
    ├── secrets/
    └── waf/

2.2 네트워킹 모듈 예제

# modules/networking/main.tf
resource "aws_vpc" "main" {
  cidr_block           = var.vpc_cidr
  enable_dns_hostnames = true
  enable_dns_support   = true

  tags = merge(var.common_tags, {
    Name = "${var.name_prefix}-vpc"
  })
}

resource "aws_subnet" "public" {
  count = length(var.public_subnet_cidrs)

  vpc_id                  = aws_vpc.main.id
  cidr_block              = var.public_subnet_cidrs[count.index]
  availability_zone       = data.aws_availability_zones.available.names[count.index]
  map_public_ip_on_launch = true

  tags = merge(var.common_tags, {
    Name = "${var.name_prefix}-public-${count.index + 1}"
    Type = "public"
  })
}

resource "aws_subnet" "private" {
  count = length(var.private_subnet_cidrs)

  vpc_id            = aws_vpc.main.id
  cidr_block        = var.private_subnet_cidrs[count.index]
  availability_zone = data.aws_availability_zones.available.names[count.index]

  tags = merge(var.common_tags, {
    Name = "${var.name_prefix}-private-${count.index + 1}"
    Type = "private"
  })
}

# NAT Gateway for private subnets
resource "aws_eip" "nat" {
  count = var.enable_nat_gateway ? length(var.public_subnet_cidrs) : 0

  domain = "vpc"
  depends_on = [aws_internet_gateway.main]

  tags = merge(var.common_tags, {
    Name = "${var.name_prefix}-nat-eip-${count.index + 1}"
  })
}

resource "aws_nat_gateway" "main" {
  count = var.enable_nat_gateway ? length(var.public_subnet_cidrs) : 0

  allocation_id = aws_eip.nat[count.index].id
  subnet_id     = aws_subnet.public[count.index].id

  tags = merge(var.common_tags, {
    Name = "${var.name_prefix}-nat-${count.index + 1}"
  })

  depends_on = [aws_internet_gateway.main]
}

2.3 모듈 사용 예제

# environments/production/main.tf
module "networking" {
  source = "../../modules/networking"

  name_prefix = "prod"
  vpc_cidr    = "10.0.0.0/16"

  public_subnet_cidrs  = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
  private_subnet_cidrs = ["10.0.11.0/24", "10.0.12.0/24", "10.0.13.0/24"]

  enable_nat_gateway = true

  common_tags = {
    Environment = "production"
    Project     = "web-app"
    Owner       = "platform-team"
  }
}

module "compute" {
  source = "../../modules/compute/ecs"

  cluster_name = "prod-web-cluster"
  vpc_id       = module.networking.vpc_id
  subnet_ids   = module.networking.private_subnet_ids

  common_tags = {
    Environment = "production"
    Project     = "web-app"
  }

  depends_on = [module.networking]
}

3. 상태 관리 및 백엔드 구성

3.1 Terraform Remote State

# backend.tf
terraform {
  backend "s3" {
    bucket         = "company-terraform-state"
    key            = "environments/production/terraform.tfstate"
    region         = "us-west-2"
    encrypt        = true
    dynamodb_table = "terraform-state-lock"
  }
}

# S3 버킷 및 DynamoDB 테이블 생성
resource "aws_s3_bucket" "terraform_state" {
  bucket = "company-terraform-state"
}

resource "aws_s3_bucket_versioning" "terraform_state" {
  bucket = aws_s3_bucket.terraform_state.id
  versioning_configuration {
    status = "Enabled"
  }
}

resource "aws_s3_bucket_server_side_encryption_configuration" "terraform_state" {
  bucket = aws_s3_bucket.terraform_state.id

  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "AES256"
    }
  }
}

resource "aws_dynamodb_table" "terraform_state_lock" {
  name           = "terraform-state-lock"
  billing_mode   = "PAY_PER_REQUEST"
  hash_key       = "LockID"

  attribute {
    name = "LockID"
    type = "S"
  }

  tags = {
    Name = "TerraformStateLock"
  }
}

3.2 Pulumi 상태 관리

# Pulumi.yaml
# (YAML comments start with "#"; a leading "//" line makes the file invalid.)
name: web-application
runtime: nodejs
backend:
  url: s3://company-pulumi-state

// index.ts에서 상태 참조
import * as pulumi from "@pulumi/pulumi";

const stackRef = new pulumi.StackReference("company/networking/production");
const vpcId = stackRef.getOutput("vpcId");
const subnetIds = stackRef.getOutput("privateSubnetIds");

4. 환경별 관리 전략

4.1 Workspace 기반 관리

# Terraform workspace 생성 및 관리
terraform workspace new development
terraform workspace new staging
terraform workspace new production

# 환경별 변수 파일
terraform apply -var-file="environments/development.tfvars"
terraform apply -var-file="environments/production.tfvars"

4.2 환경별 변수 파일

# environments/development.tfvars
environment = "development"
instance_type = "t3.micro"
min_size = 1
max_size = 3
desired_capacity = 1

# environments/production.tfvars
environment = "production"
instance_type = "t3.large"
min_size = 3
max_size = 10
desired_capacity = 5

4.3 조건부 리소스 생성

# 환경에 따른 조건부 생성
resource "aws_cloudwatch_log_group" "app_logs" {
  count = var.environment == "production" ? 1 : 0

  name              = "/aws/application/${var.app_name}"
  retention_in_days = 30
}

# 환경별 설정
locals {
  environment_config = {
    development = {
      instance_type = "t3.micro"
      replica_count = 1
      storage_size  = 20
    }
    staging = {
      instance_type = "t3.small"
      replica_count = 2
      storage_size  = 50
    }
    production = {
      instance_type = "t3.large"
      replica_count = 3
      storage_size  = 100
    }
  }

  config = local.environment_config[var.environment]
}

resource "aws_instance" "app" {
  count = local.config.replica_count

  ami           = data.aws_ami.ubuntu.id
  instance_type = local.config.instance_type
}

5. 보안 모범 사례

5.1 IAM 역할 기반 접근

# terraform-execution-role.tf
resource "aws_iam_role" "terraform_execution" {
  name = "TerraformExecutionRole"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = "ec2.amazonaws.com"
        }
      },
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          AWS = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:user/terraform-ci"
        }
        Condition = {
          StringEquals = {
            "sts:ExternalId" = var.external_id
          }
        }
      }
    ]
  })
}

resource "aws_iam_policy" "terraform_execution" {
  name = "TerraformExecutionPolicy"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = [
          "ec2:*",
          "rds:*",
          "s3:*",
          "iam:*",
          "lambda:*",
          "apigateway:*",
          "cloudformation:*"
        ]
        Resource = "*"
      },
      {
        Effect = "Deny"
        Action = [
          "iam:DeleteRole",
          "iam:DeleteUser",
          "iam:DeletePolicy"
        ]
        Resource = "*"
        Condition = {
          StringNotEquals = {
            "aws:PrincipalTag/Environment" = var.environment
          }
        }
      }
    ]
  })
}

5.2 시크릿 관리

# AWS Secrets Manager 활용
resource "aws_secretsmanager_secret" "database_password" {
  name                    = "${var.environment}-database-password"
  description             = "Database password for ${var.environment}"
  recovery_window_in_days = 7

  tags = {
    Environment = var.environment
    Application = var.application_name
  }
}

resource "aws_secretsmanager_secret_version" "database_password" {
  secret_id = aws_secretsmanager_secret.database_password.id
  secret_string = jsonencode({
    password = random_password.database.result
  })
}

resource "random_password" "database" {
  length  = 32
  special = true
}

# Use the generated secret as the RDS master password
resource "aws_db_instance" "main" {
  identifier = "${var.environment}-database"

  engine         = "postgres"
  engine_version = "13.7"
  instance_class = "db.t3.micro"

  allocated_storage = 20
  storage_encrypted = true

  db_name  = var.database_name
  username = var.database_username

  # `password` and `manage_master_user_password` are mutually exclusive.
  # This example stores the generated password in its own Secrets Manager
  # secret, so the password is passed explicitly. To have RDS create and
  # rotate the secret itself, drop `password` (and the secret resources) and
  # set `manage_master_user_password = true` instead.
  password = random_password.database.result

  vpc_security_group_ids = [aws_security_group.database.id]
  db_subnet_group_name   = aws_db_subnet_group.main.name

  backup_retention_period = var.environment == "production" ? 7 : 1
  backup_window           = "03:00-04:00"
  maintenance_window      = "sun:04:00-sun:05:00"

  # When the final snapshot is not skipped (production), a snapshot
  # identifier is required or destroying the instance fails.
  skip_final_snapshot       = var.environment != "production"
  final_snapshot_identifier = var.environment == "production" ? "${var.environment}-database-final" : null

  tags = {
    Environment = var.environment
  }
}

5.3 리소스 태깅 전략

# Common tag definitions
#
# timestamp() returns a new value on every run, which would make every tagged
# resource show a change on every plan. time_static (hashicorp/time provider)
# records the time once, at creation, and keeps it stable in state afterwards.
resource "time_static" "created_at" {}

locals {
  common_tags = {
    Environment = var.environment
    Project     = var.project_name
    Owner       = var.team_name
    CostCenter  = var.cost_center
    CreatedBy   = "Terraform"
    CreatedAt   = time_static.created_at.rfc3339
  }
}

# 태그 정책 적용
resource "aws_organizations_policy" "tagging_policy" {
  name        = "RequiredTaggingPolicy"
  description = "Enforce required tags on resources"
  type        = "TAG_POLICY"

  content = jsonencode({
    tags = {
      Environment = {
        tag_key = {
          "@@assign" = "Environment"
        }
        tag_value = {
          "@@assign" = ["development", "staging", "production"]
        }
        enforced_for = {
          "@@assign" = ["ec2:instance", "rds:db", "s3:bucket"]
        }
      }
    }
  })
}

6. CI/CD 통합

6.1 GitHub Actions 워크플로우

# .github/workflows/terraform.yml
name: Terraform CI/CD

on:
  push:
    branches: [ main, develop ]
  pull_request:
    branches: [ main ]

# The AWS credentials step assumes a role without static keys (OIDC), which
# requires the workflow to be allowed to request an ID token.
permissions:
  contents: read
  id-token: write

env:
  TF_VERSION: 1.5.0
  AWS_REGION: us-west-2

jobs:
  validate:
    name: Validate Terraform
    runs-on: ubuntu-latest

    steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Setup Terraform
      uses: hashicorp/setup-terraform@v3
      with:
        terraform_version: ${{ env.TF_VERSION }}

    - name: Configure AWS credentials
      uses: aws-actions/configure-aws-credentials@v4
      with:
        role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
        aws-region: ${{ env.AWS_REGION }}

    - name: Terraform Format Check
      run: terraform fmt -check -recursive

    - name: Terraform Init
      run: terraform init

    - name: Terraform Validate
      run: terraform validate

    - name: Terraform Plan
      run: terraform plan -out=tfplan

    # upload-artifact/download-artifact v3 have been retired; v4 is required.
    - name: Upload Plan
      uses: actions/upload-artifact@v4
      with:
        name: terraform-plan
        path: tfplan

  security-scan:
    name: Security Scan
    runs-on: ubuntu-latest
    needs: validate
    # Job-level permissions replace the workflow-level ones; uploading SARIF
    # results to code scanning needs security-events: write.
    permissions:
      contents: read
      security-events: write

    steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # Pin to a release tag instead of the mutable master branch.
    - name: Run Checkov
      id: checkov
      uses: bridgecrewio/checkov-action@v12
      with:
        directory: .
        framework: terraform
        output_format: sarif
        output_file_path: checkov.sarif

    - name: Upload SARIF file
      uses: github/codeql-action/upload-sarif@v3
      if: success() || failure()
      with:
        sarif_file: checkov.sarif

  apply:
    name: Apply Terraform
    runs-on: ubuntu-latest
    needs: [validate, security-scan]
    if: github.ref == 'refs/heads/main' && github.event_name == 'push'
    environment: production

    steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Setup Terraform
      uses: hashicorp/setup-terraform@v3
      with:
        terraform_version: ${{ env.TF_VERSION }}

    - name: Configure AWS credentials
      uses: aws-actions/configure-aws-credentials@v4
      with:
        role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
        aws-region: ${{ env.AWS_REGION }}

    - name: Download Plan
      uses: actions/download-artifact@v4
      with:
        name: terraform-plan

    - name: Terraform Init
      run: terraform init

    # Applies exactly the plan produced (and scanned) earlier in this run.
    - name: Terraform Apply
      run: terraform apply -auto-approve tfplan

6.2 GitLab CI 파이프라인

# .gitlab-ci.yml
stages:
  - validate
  - plan
  - security
  - apply

variables:
  TF_VERSION: "1.5.0"
  TF_ROOT: ${CI_PROJECT_DIR}
  TF_ADDRESS: ${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/terraform/state/${CI_ENVIRONMENT_NAME}

cache:
  key: "${TF_ROOT}"
  paths:
    - ${TF_ROOT}/.terraform

before_script:
  - cd ${TF_ROOT}
  - gitlab-terraform init

validate:
  stage: validate
  image:
    name: hashicorp/terraform:${TF_VERSION}
    entrypoint: [""]
  script:
    - gitlab-terraform fmt -check
    - gitlab-terraform validate
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH

plan:
  stage: plan
  image:
    name: hashicorp/terraform:${TF_VERSION}
    entrypoint: [""]
  script:
    - gitlab-terraform plan
    - gitlab-terraform plan-json
  artifacts:
    name: plan
    paths:
      - ${TF_ROOT}/plan.cache
    reports:
      terraform: ${TF_ROOT}/plan.json
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH

security-scan:
  stage: security
  image:
    name: bridgecrew/checkov:latest
    entrypoint: [""]
  # Override the global before_script: it runs `gitlab-terraform init`, which
  # does not exist in the Checkov image and would fail the job before the scan.
  before_script: []
  script:
    - checkov -d . --framework terraform --output cli --output junitxml --output-file-path console,checkov.xml
  artifacts:
    reports:
      junit: checkov.xml
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH

apply:
  stage: apply
  # NOTE(review): `gitlab-terraform` is a GitLab-provided wrapper script;
  # confirm that the image used here actually ships it.
  image:
    name: hashicorp/terraform:${TF_VERSION}
    entrypoint: [""]
  script:
    - gitlab-terraform apply
  dependencies:
    - plan
  # `rules` and `only` cannot be used in the same job (the pipeline fails
  # validation), so the branch restriction lives in `rules` alone: the job is
  # offered as a manual action on the default branch only.
  rules:
    - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
      when: manual

7. 테스트 전략

7.1 Terraform 단위 테스트

// tests/networking_test.go
package test

import (
    "testing"

    "github.com/gruntwork-io/terratest/modules/aws"
    "github.com/gruntwork-io/terratest/modules/terraform"
    "github.com/stretchr/testify/assert"
)

func TestNetworkingModule(t *testing.T) {
    t.Parallel()

    terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{
        TerraformDir: "../modules/networking",
        Vars: map[string]interface{}{
            "name_prefix": "test",
            "vpc_cidr": "10.0.0.0/16",
            "public_subnet_cidrs": []string{"10.0.1.0/24", "10.0.2.0/24"},
            "private_subnet_cidrs": []string{"10.0.11.0/24", "10.0.12.0/24"},
            "enable_nat_gateway": true,
            "common_tags": map[string]string{
                "Environment": "test",
                "Project": "terratest",
            },
        },
    })

    defer terraform.Destroy(t, terraformOptions)
    terraform.InitAndApply(t, terraformOptions)

    // VPC 검증
    vpcId := terraform.Output(t, terraformOptions, "vpc_id")
    assert.NotEmpty(t, vpcId)

    vpc := aws.GetVpcById(t, vpcId, "us-west-2")
    assert.Equal(t, "10.0.0.0/16", *vpc.CidrBlock)

    // 서브넷 검증
    publicSubnetIds := terraform.OutputList(t, terraformOptions, "public_subnet_ids")
    assert.Len(t, publicSubnetIds, 2)

    for _, subnetId := range publicSubnetIds {
        subnet := aws.GetSubnetById(t, subnetId, "us-west-2")
        assert.True(t, *subnet.MapPublicIpOnLaunch)
    }
}

7.2 Pulumi 테스트

// tests/networking.test.ts
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
import { expect } from "chai";
import "mocha";

pulumi.runtime.setMocks({
    newResource: (args: pulumi.runtime.MockResourceArgs): {id: string, state: any} => {
        switch (args.type) {
            case "aws:ec2/vpc:Vpc":
                return { id: "vpc-12345", state: { cidrBlock: "10.0.0.0/16" } };
            case "aws:ec2/subnet:Subnet":
                return { id: "subnet-12345", state: { cidrBlock: "10.0.1.0/24" } };
            default:
                return { id: args.name + "_id", state: args.inputs };
        }
    },
    call: (args: pulumi.runtime.MockCallArgs) => {
        return args.inputs;
    },
});

describe("Networking Infrastructure", () => {
    let infra: typeof import("../index");

    before(async () => {
        infra = await import("../index");
    });

    it("should create VPC with correct CIDR", (done) => {
        pulumi.all([infra.vpc.cidrBlock]).apply(([cidrBlock]) => {
            expect(cidrBlock).to.equal("10.0.0.0/16");
            done();
        });
    });

    it("should create public subnets", (done) => {
        pulumi.all([infra.publicSubnets]).apply(([subnets]) => {
            expect(subnets).to.have.lengthOf(2);
            done();
        });
    });
});

7.3 통합 테스트

# tests/integration/test_full_stack.py
import boto3
import pytest
import time
# moto 5 replaced the per-service decorators (mock_ec2, mock_rds, ...) with a
# single mock_aws decorator that covers every service.
from moto import mock_aws

@pytest.fixture
def aws_credentials(monkeypatch):
    """Provide dummy AWS credentials so boto3 never picks up real ones.

    monkeypatch restores the previous environment after each test, so the
    fake values do not leak into other tests.
    """
    for variable in (
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_SECURITY_TOKEN",
        "AWS_SESSION_TOKEN",
    ):
        monkeypatch.setenv(variable, "testing")

@mock_aws
def test_infrastructure_deployment(aws_credentials):
    """Create a VPC, subnet, security group and instance against mocked AWS
    and assert that the instance reaches the running state."""

    # EC2 client bound to the mocked backend
    ec2 = boto3.client("ec2", region_name="us-west-2")

    # Create the VPC
    vpc_response = ec2.create_vpc(CidrBlock="10.0.0.0/16")
    vpc_id = vpc_response["Vpc"]["VpcId"]

    # Create a subnet inside the VPC
    subnet_response = ec2.create_subnet(
        VpcId=vpc_id,
        CidrBlock="10.0.1.0/24"
    )
    subnet_id = subnet_response["Subnet"]["SubnetId"]

    # Create a security group inside the VPC
    sg_response = ec2.create_security_group(
        GroupName="test-sg",
        Description="Test security group",
        VpcId=vpc_id
    )
    sg_id = sg_response["GroupId"]

    # Launch a single instance in the subnet
    instances = ec2.run_instances(
        ImageId="ami-12345678",
        MinCount=1,
        MaxCount=1,
        InstanceType="t2.micro",
        SubnetId=subnet_id,
        SecurityGroupIds=[sg_id]
    )

    instance_id = instances["Instances"][0]["InstanceId"]

    # Wait until the instance is reported as running
    waiter = ec2.get_waiter("instance_running")
    waiter.wait(InstanceIds=[instance_id])

    # Verify the final state
    instance = ec2.describe_instances(InstanceIds=[instance_id])
    assert instance["Reservations"][0]["Instances"][0]["State"]["Name"] == "running"

8. 비용 최적화

8.1 리소스 스케줄링

# Automatically stop/start development instances
resource "aws_lambda_function" "instance_scheduler" {
  filename      = "instance_scheduler.zip"
  function_name = "instance-scheduler"
  role          = aws_iam_role.lambda_role.arn
  handler       = "index.handler"
  # python3.9 is a deprecated Lambda runtime; use a currently supported one.
  runtime       = "python3.12"

  environment {
    variables = {
      ENVIRONMENT = var.environment
    }
  }
}

# EventBridge cron expressions have six fields and are evaluated in UTC.
# Day-of-month and day-of-week cannot both be specified: when one of them
# holds a value (or *), the other must be "?". "* * MON-FRI" is rejected.
resource "aws_cloudwatch_event_rule" "stop_instances" {
  count = var.environment == "development" ? 1 : 0

  name                = "stop-dev-instances"
  description         = "Stop development instances at 6 PM"
  schedule_expression = "cron(0 18 ? * MON-FRI *)"
}

resource "aws_cloudwatch_event_rule" "start_instances" {
  count = var.environment == "development" ? 1 : 0

  name                = "start-dev-instances"
  description         = "Start development instances at 9 AM"
  schedule_expression = "cron(0 9 ? * MON-FRI *)"
}

8.2 비용 태깅 및 모니터링

# 비용 할당 태그
resource "aws_ce_cost_category" "project_costs" {
  name = "ProjectCosts"

  rule {
    value = "WebApplication"
    rule {
      and {
        tag {
          key           = "Project"
          values        = ["web-app"]
          match_options = ["EQUALS"]
        }
      }
    }
  }

  rule {
    value = "DataPipeline"
    rule {
      and {
        tag {
          key           = "Project"
          values        = ["data-pipeline"]
          match_options = ["EQUALS"]
        }
      }
    }
  }
}

# Monthly cost budget
resource "aws_budgets_budget" "monthly_cost" {
  name         = "${var.environment}-monthly-budget"
  budget_type  = "COST"
  limit_amount = var.monthly_budget_limit
  limit_unit   = "USD"
  time_unit    = "MONTHLY"

  # The `cost_filters` map was removed in AWS provider v5; filters are now
  # declared as `cost_filter` blocks. Tag filters use the "TagKeyValue" name
  # with values of the form "user:<key>$<value>". format() is used so the
  # literal "$" is not parsed as the start of a template sequence.
  cost_filter {
    name   = "TagKeyValue"
    values = [format("user:Environment$%s", var.environment)]
  }

  # Alert when actual spend exceeds 80% of the limit.
  notification {
    comparison_operator        = "GREATER_THAN"
    threshold                  = 80
    threshold_type             = "PERCENTAGE"
    notification_type          = "ACTUAL"
    subscriber_email_addresses = [var.budget_notification_email]
  }

  # Alert when forecasted spend exceeds 100% of the limit.
  notification {
    comparison_operator        = "GREATER_THAN"
    threshold                  = 100
    threshold_type             = "PERCENTAGE"
    notification_type          = "FORECASTED"
    subscriber_email_addresses = [var.budget_notification_email]
  }
}

9. 모니터링 및 관찰가능성

9.1 Terraform 드리프트 감지

# scripts/drift_detection.py
import boto3
import json
import subprocess
import sys
from datetime import datetime

class TerraformDriftDetector:
    """Compare EC2 instances recorded in a Terraform state file with AWS.

    The state file is read from S3; each managed ``aws_instance`` found in it
    is looked up in EC2 and differences in instance type and tags are
    reported.
    """

    def __init__(self, state_bucket, state_key, region):
        """Store the state location and create the S3 client.

        Args:
            state_bucket: Name of the S3 bucket holding the state file.
            state_key: Object key of the state file inside the bucket.
            region: AWS region used for the S3 and EC2 clients.
        """
        self.state_bucket = state_bucket
        self.state_key = state_key
        self.region = region
        self.s3_client = boto3.client('s3', region_name=region)

    def get_terraform_state(self):
        """Extract the managed EC2 instances from the Terraform state file.

        Returns:
            A list of dicts with keys ``type`` (always ``'ec2_instance'``),
            ``id`` and ``terraform_config`` (the instance attributes from
            the state file).

        Raises:
            RuntimeError: If the state file cannot be fetched or parsed. An
                unreadable state must not be mistaken for "no drift".
        """
        try:
            response = self.s3_client.get_object(
                Bucket=self.state_bucket,
                Key=self.state_key
            )
            state_data = json.loads(response['Body'].read())
        except Exception as e:
            raise RuntimeError(f"Error reading Terraform state: {e}") from e

        resources = []
        for resource in state_data.get('resources', []):
            if resource.get('type') != 'aws_instance':
                continue
            # Data sources are recorded with mode "data"; they are lookups,
            # not resources Terraform manages, so they cannot drift.
            if resource.get('mode', 'managed') != 'managed':
                continue
            for instance in resource.get('instances', []):
                attributes = instance['attributes']
                resources.append({
                    'type': 'ec2_instance',
                    'id': attributes['id'],
                    'terraform_config': attributes
                })

        return resources

    def get_aws_resources(self, resource_type, resource_ids):
        """Look up the live state of the given resources in AWS.

        Args:
            resource_type: Only ``'ec2_instance'`` is supported.
            resource_ids: List of resource IDs to describe.

        Returns:
            A dict mapping instance ID to ``instance_type``, ``state``,
            ``security_groups`` and ``tags``. IDs that AWS does not know are
            absent; an unsupported ``resource_type`` yields an empty dict.

        Raises:
            Exception: Any AWS error other than an unknown/invalid instance
                ID is re-raised, so that credential or throttling problems
                are not reported as missing resources.
        """
        if resource_type != 'ec2_instance':
            return {}

        ec2 = boto3.client('ec2', region_name=self.region)
        try:
            response = ec2.describe_instances(InstanceIds=resource_ids)
        except Exception as e:
            # botocore client errors expose the AWS error code via e.response.
            error_code = getattr(e, 'response', {}).get('Error', {}).get('Code', '')
            if error_code.startswith('InvalidInstanceID'):
                return {}
            raise

        actual_resources = {}
        for reservation in response['Reservations']:
            for instance in reservation['Instances']:
                actual_resources[instance['InstanceId']] = {
                    'instance_type': instance['InstanceType'],
                    'state': instance['State']['Name'],
                    'security_groups': [sg['GroupId'] for sg in instance['SecurityGroups']],
                    'tags': {tag['Key']: tag['Value'] for tag in instance.get('Tags', [])}
                }

        return actual_resources

    def detect_drift(self):
        """Compare every instance in the state file with its live counterpart.

        Returns:
            A list of drift records. Each has ``resource_id`` and
            ``drift_type`` (``MISSING``, ``CONFIGURATION`` or
            ``TAG_MISMATCH``); ``MISSING`` records carry ``description``,
            the others carry ``attribute``, ``terraform_value`` and
            ``aws_value``.
        """
        terraform_resources = self.get_terraform_state()
        drift_report = []

        for tf_resource in terraform_resources:
            resource_type = tf_resource['type']
            resource_id = tf_resource['id']
            tf_config = tf_resource['terraform_config']

            if resource_type != 'ec2_instance':
                continue

            # Looked up one ID at a time: describe_instances fails the whole
            # call when any single requested ID is unknown.
            aws_resources = self.get_aws_resources(resource_type, [resource_id])

            if resource_id not in aws_resources:
                drift_report.append({
                    'resource_id': resource_id,
                    'drift_type': 'MISSING',
                    'description': f'Resource {resource_id} exists in Terraform but not in AWS'
                })
                continue

            aws_resource = aws_resources[resource_id]

            # Instance type check
            if tf_config.get('instance_type') != aws_resource.get('instance_type'):
                drift_report.append({
                    'resource_id': resource_id,
                    'drift_type': 'CONFIGURATION',
                    'attribute': 'instance_type',
                    'terraform_value': tf_config.get('instance_type'),
                    'aws_value': aws_resource.get('instance_type')
                })

            # Tag check. The state file may store "tags": null, so fall back
            # to an empty dict instead of calling .items() on None.
            tf_tags = tf_config.get('tags') or {}
            aws_tags = aws_resource.get('tags') or {}

            for key, value in tf_tags.items():
                if key not in aws_tags or aws_tags[key] != value:
                    drift_report.append({
                        'resource_id': resource_id,
                        'drift_type': 'TAG_MISMATCH',
                        'attribute': f'tag:{key}',
                        'terraform_value': value,
                        'aws_value': aws_tags.get(key, 'NOT_SET')
                    })

        return drift_report

    def generate_report(self):
        """Print a human-readable drift report.

        Returns:
            True when no drift was detected, False otherwise.
        """
        drift_issues = self.detect_drift()

        if not drift_issues:
            print("✅ No configuration drift detected")
            return True

        print("⚠️ Configuration drift detected:")
        print("=" * 50)

        for issue in drift_issues:
            print(f"Resource: {issue['resource_id']}")
            print(f"Type: {issue['drift_type']}")

            if 'attribute' in issue:
                print(f"Attribute: {issue['attribute']}")
                print(f"Terraform: {issue['terraform_value']}")
                print(f"AWS: {issue['aws_value']}")
            else:
                print(f"Description: {issue['description']}")

            print("-" * 30)

        return False

if __name__ == "__main__":
    detector = TerraformDriftDetector(
        state_bucket="company-terraform-state",
        state_key="environments/production/terraform.tfstate",
        region="us-west-2"
    )

    success = detector.generate_report()
    sys.exit(0 if success else 1)

9.2 CloudWatch 대시보드

# monitoring/cloudwatch.tf
resource "aws_cloudwatch_dashboard" "infrastructure" {
  dashboard_name = "${var.environment}-infrastructure"

  dashboard_body = jsonencode({
    widgets = [
      {
        type   = "metric"
        x      = 0
        y      = 0
        width  = 12
        height = 6

        properties = {
          metrics = [
            ["AWS/EC2", "CPUUtilization", "AutoScalingGroupName", aws_autoscaling_group.web.name],
            ["AWS/ApplicationELB", "TargetResponseTime", "LoadBalancer", aws_lb.web.arn_suffix],
            ["AWS/RDS", "DatabaseConnections", "DBInstanceIdentifier", aws_db_instance.main.id]
          ]
          view    = "timeSeries"
          stacked = false
          region  = var.aws_region
          title   = "Infrastructure Metrics"
          period  = 300
        }
      },
      {
        type   = "log"
        x      = 0
        y      = 6
        width  = 24
        height = 6

        properties = {
          query = <<-EOT
            SOURCE '/aws/lambda/infrastructure-monitor'
            | fields @timestamp, @message
            | filter @message like /ERROR/
            | sort @timestamp desc
            | limit 20
          EOT
          region = var.aws_region
          title  = "Infrastructure Errors"
          view   = "table"
        }
      }
    ]
  })
}

# 알람 설정
resource "aws_cloudwatch_metric_alarm" "high_cpu" {
  alarm_name          = "${var.environment}-high-cpu-utilization"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "2"
  metric_name         = "CPUUtilization"
  namespace           = "AWS/EC2"
  period              = "300"
  statistic           = "Average"
  threshold           = "80"
  alarm_description   = "This metric monitors ec2 cpu utilization"
  alarm_actions       = [aws_sns_topic.alerts.arn]

  dimensions = {
    AutoScalingGroupName = aws_autoscaling_group.web.name
  }
}

10. 대규모 환경 관리

10.1 계층형 모듈 구조

infrastructure/
├── modules/
│   ├── foundation/
│   │   ├── networking/
│   │   ├── security/
│   │   └── monitoring/
│   ├── platform/
│   │   ├── compute/
│   │   ├── data/
│   │   └── messaging/
│   └── applications/
│       ├── web-app/
│       ├── api-gateway/
│       └── microservices/
├── environments/
│   ├── development/
│   ├── staging/
│   └── production/
└── shared/
    ├── terraform.tf
    ├── variables.tf
    └── outputs.tf

10.2 의존성 관리

# environments/production/main.tf
module "foundation" {
  source = "../../modules/foundation"

  environment      = "production"
  vpc_cidr        = "10.0.0.0/16"
  availability_zones = ["us-west-2a", "us-west-2b", "us-west-2c"]

  common_tags = local.common_tags
}

module "platform" {
  source = "../../modules/platform"

  vpc_id              = module.foundation.vpc_id
  private_subnet_ids  = module.foundation.private_subnet_ids
  public_subnet_ids   = module.foundation.public_subnet_ids

  database_subnet_group = module.foundation.database_subnet_group_name
  cache_subnet_group    = module.foundation.cache_subnet_group_name

  common_tags = local.common_tags

  depends_on = [module.foundation]
}

# A module `source` must be a literal string: it is resolved during
# `terraform init`, before expressions such as each.key are evaluated, so it
# cannot be interpolated. Declare one module block per application instead.
# The keys of var.applications are the application directory names.
module "web_app" {
  source = "../../modules/applications/web-app"

  vpc_id            = module.foundation.vpc_id
  subnet_ids        = module.foundation.private_subnet_ids
  load_balancer_arn = module.platform.application_load_balancer_arn

  database_endpoint = module.platform.database_endpoint
  cache_endpoint    = module.platform.cache_endpoint

  application_config = var.applications["web-app"]

  common_tags = local.common_tags

  depends_on = [module.platform]
}

module "api_gateway" {
  source = "../../modules/applications/api-gateway"

  vpc_id            = module.foundation.vpc_id
  subnet_ids        = module.foundation.private_subnet_ids
  load_balancer_arn = module.platform.application_load_balancer_arn

  database_endpoint = module.platform.database_endpoint
  cache_endpoint    = module.platform.cache_endpoint

  application_config = var.applications["api-gateway"]

  common_tags = local.common_tags

  depends_on = [module.platform]
}

module "microservices" {
  source = "../../modules/applications/microservices"

  vpc_id            = module.foundation.vpc_id
  subnet_ids        = module.foundation.private_subnet_ids
  load_balancer_arn = module.platform.application_load_balancer_arn

  database_endpoint = module.platform.database_endpoint
  cache_endpoint    = module.platform.cache_endpoint

  application_config = var.applications["microservices"]

  common_tags = local.common_tags

  depends_on = [module.platform]
}

10.3 멀티 리전 배포

# multi-region/main.tf
locals {
  regions = ["us-west-2", "us-east-1", "eu-west-1"]
}

# One instance of the regional stack per entry in local.regions.
#
# A provider reference (`aws.<alias>`) is static configuration: it cannot be
# assembled with string interpolation and cannot vary between for_each
# instances. The target region is therefore passed only through the `region`
# input; the module is expected to set `region = var.region` on its resources
# (per-resource region argument, available from AWS provider v6.0).
module "regional_infrastructure" {
  for_each = toset(local.regions)

  source = "./modules/regional"

  region      = each.value
  environment = var.environment

  # Global settings
  global_tags = var.global_tags

  # Per-region settings: us-west-2 is the primary region
  primary_region = each.value == "us-west-2"

  # Cross-region replication targets: every region except this instance's own
  replication_regions = [for r in local.regions : r if r != each.value]
}

# Route 53 health check per region, probing the regional load balancer over
# HTTPS at /health.
#
# cloudwatch_alarm_name, cloudwatch_alarm_region and
# insufficient_data_health_status belong to CLOUDWATCH_METRIC health checks
# only (and "Failure" is not an accepted status value), so they are not set on
# this endpoint-based check. Thresholds are numbers, not strings.
resource "aws_route53_health_check" "regional" {
  for_each = toset(local.regions)

  fqdn              = module.regional_infrastructure[each.value].load_balancer_dns
  port              = 443
  type              = "HTTPS"
  resource_path     = "/health"
  failure_threshold = 3
  request_interval  = 30

  tags = {
    Name = "HealthCheck-${each.value}"
  }
}

# Failover alias record per region: us-west-2 is PRIMARY, every other region
# is SECONDARY. Each record is tied to that region's health check.
#
# NOTE(review): with three entries in local.regions this produces two
# SECONDARY records for the same name/type. Route 53 failover routing appears
# to allow only one primary and one secondary record per name and type, so
# this likely fails at apply time. Confirm, then either reduce to two regions
# or move to another routing policy (e.g. latency or weighted).
resource "aws_route53_record" "failover" {
  for_each = toset(local.regions)

  zone_id = aws_route53_zone.main.zone_id
  name    = var.domain_name
  type    = "A"

  # Must be unique among records sharing the same name and type
  set_identifier = each.value

  failover_routing_policy {
    type = each.value == "us-west-2" ? "PRIMARY" : "SECONDARY"
  }

  alias {
    name                   = module.regional_infrastructure[each.value].load_balancer_dns
    zone_id                = module.regional_infrastructure[each.value].load_balancer_zone_id
    evaluate_target_health = true
  }

  health_check_id = aws_route53_health_check.regional[each.value].id
}

11. 트러블슈팅 가이드

11.1 일반적인 문제 해결

#!/bin/bash
# scripts/terraform_troubleshoot.sh
#
# Runs a series of Terraform diagnostic checks in the current working
# directory and prints the outcome of each one.

echo "Terraform 트러블슈팅 도구"
echo "========================"

# Verify that the state can be listed and that every tracked resource can be
# read back from it.
#
# Prints a warning for each resource address that cannot be shown.
# Exits the whole script with status 1 when the state cannot be listed
# (typically a backend configuration problem).
check_state_file() {
    echo "상태 파일 검사 중..."

    # List the state once; `local` is declared separately so that the exit
    # status tested below is the one from terraform, not from `local`.
    local resources
    if ! resources=$(terraform state list 2>/dev/null); then
        echo "❌ 상태 파일 접근 불가"
        echo "백엔드 설정을 확인하세요"
        exit 1
    fi

    echo "✅ 상태 파일 접근 가능"

    # `terraform show` expects a state/plan FILE path; a single resource
    # address has to be inspected with `terraform state show`.
    local resource
    while IFS= read -r resource; do
        [ -n "$resource" ] || continue
        # stdin is redirected so terraform cannot consume the address list
        if ! terraform state show "$resource" < /dev/null > /dev/null 2>&1; then
            echo "⚠️  고아 리소스 발견: $resource"
        fi
    done <<< "$resources"
}

# Report the toolchain state: Terraform version information, whether the
# working directory has been initialized, and the providers in use.
check_dependencies() {
    echo "의존성 검사 중..."

    # Version information
    terraform version

    # Warn when there is no .terraform directory in the working directory
    [ -d ".terraform" ] || {
        echo "⚠️  Terraform이 초기화되지 않았습니다"
        echo "terraform init을 실행하세요"
    }

    # Providers required by the configuration
    terraform providers
}

# Validate the configuration: formatting check, `terraform validate`, then a
# silent plan whose exit code is translated into a one-line verdict.
#
# `terraform plan -detailed-exitcode` exits with 0 (no changes), 1 (error) or
# 2 (plan succeeded and contains changes).
validate_configuration() {
    echo "구성 검증 중..."

    # Formatting check
    terraform fmt -check -recursive

    # Configuration validation
    terraform validate

    # Plan check. -input=false is required because the output is discarded:
    # an interactive prompt for a missing variable would otherwise block
    # forever without anything visible on screen.
    terraform plan -detailed-exitcode -input=false > /dev/null 2>&1
    local exit_code=$?

    case $exit_code in
        0)
            echo "✅ 계획 검사 통과"
            ;;
        1)
            echo "❌ 계획 실패 - 구성 오류"
            ;;
        2)
            echo "ℹ️  계획 성공 - 변경 사항 있음"
            ;;
        *)
            # Anything else (e.g. killed by a signal) used to pass silently
            echo "❌ 계획 실패 - 예상치 못한 종료 코드: $exit_code"
            ;;
    esac
}

# Detect drift by running a plan and inspecting its detailed exit code
# (2 = plan contains changes, 1 = plan failed).
#
# The plan output goes to a private temporary file rather than a fixed name in
# the working directory, so an existing plan_output.txt is never overwritten
# or deleted. A failed plan is reported instead of being silently ignored.
check_resource_drift() {
    echo "리소스 드리프트 검사 중..."

    local plan_output exit_code
    plan_output=$(mktemp) || return 1

    # -input=false: never wait on a prompt whose text is hidden in the file
    terraform plan -detailed-exitcode -input=false > "$plan_output" 2>&1
    exit_code=$?

    case $exit_code in
        2)
            echo "⚠️  구성 드리프트 감지됨"
            echo "변경 사항:"
            # Show each "will be ..." action line with 5 lines of context
            grep -A 5 -B 5 "will be" "$plan_output"
            ;;
        1)
            echo "❌ 계획 실패 - 드리프트를 확인할 수 없습니다"
            cat "$plan_output"
            ;;
    esac

    rm -f "$plan_output"
}

# Entry point: run every check in order, separated by a blank line.
main() {
    local step
    local is_first=1

    for step in check_state_file check_dependencies validate_configuration check_resource_drift; do
        # Blank separator between checks, but not before the first one
        [ "$is_first" -eq 1 ] || echo ""
        is_first=0
        "$step"
    done
}

main "$@"

11.2 성능 최적화

# terraform/performance.tf

# Terraform settings.
#
# Parallelism is not configured here: it is a CLI option
# (`terraform apply -parallelism=N`, default 10).
#
# The `module_variable_optional_attrs` experiment concluded in Terraform 1.3:
# optional() object attributes now work without opting in, and listing the
# experiment is an error on current versions. Requiring 1.3+ is what is
# actually needed to use optional attributes.
terraform {
  required_version = ">= 1.3.0"
}

# Aliased AWS provider configurations, one per region
# (referenced as aws.us_west_2 and aws.us_east_1).
provider "aws" {
  alias  = "us_west_2"
  region = "us-west-2"
}

provider "aws" {
  alias  = "us_east_1"
  region = "us-east-1"
}

# Availability zones currently in the "available" state
data "aws_availability_zones" "available" {
  state = "available"

  # Keep only zones whose zone-type is "availability-zone"
  filter {
    name   = "zone-type"
    values = ["availability-zone"]
  }
}

# One bucket per region. The instances are independent of each other, so
# Terraform can create them concurrently.
#
# The `provider` meta-argument only accepts a static reference such as
# `aws.us_west_2`; it cannot be computed from each.key. The region is instead
# selected per instance with the resource-level `region` argument
# (AWS provider v6.0+), which keeps a single for_each resource.
resource "aws_s3_bucket" "regional" {
  for_each = {
    us_west_2 = "us-west-2"
    us_east_1 = "us-east-1"
  }

  region = each.value

  bucket = "company-${each.value}-${random_string.suffix.result}"
}

# Security group definitions grouped in one map, keyed by group name.
# Each ingress rule carries either cidr_blocks or security_groups; entries in
# security_groups are keys of this same map (e.g. "web").
locals {
  security_groups = {
    # Public web tier: HTTP and HTTPS from anywhere
    web = {
      name_prefix = "web-sg"
      ingress = [
        { from_port = 80, to_port = 80, protocol = "tcp", cidr_blocks = ["0.0.0.0/0"] },
        { from_port = 443, to_port = 443, protocol = "tcp", cidr_blocks = ["0.0.0.0/0"] }
      ]
    }
    # App tier: port 8080, source given as the "web" group
    app = {
      name_prefix = "app-sg"
      ingress = [
        { from_port = 8080, to_port = 8080, protocol = "tcp", security_groups = ["web"] }
      ]
    }
  }
}

# Security groups, one per entry in local.security_groups.
#
# Rules are deliberately NOT declared inline: a rule that points at another
# group would make aws_security_group.this reference itself, which Terraform
# rejects. Creating the bare groups first and attaching rules through a
# separate resource removes the self-reference.
resource "aws_security_group" "this" {
  for_each = local.security_groups

  name_prefix = each.value.name_prefix
  vpc_id      = aws_vpc.main.id
}

# Flatten every ingress rule into a single map keyed "<group>-<index>",
# remembering which group each rule belongs to.
locals {
  security_group_ingress_rules = merge([
    for sg_name, sg in local.security_groups : {
      for idx, rule in sg.ingress :
      "${sg_name}-${idx}" => merge(rule, { group = sg_name })
    }
  ]...)
}

# One ingress rule per flattened entry. A rule has either cidr_blocks or a
# source security group; the unused one is left null.
resource "aws_security_group_rule" "ingress" {
  for_each = local.security_group_ingress_rules

  type              = "ingress"
  security_group_id = aws_security_group.this[each.value.group].id

  from_port = each.value.from_port
  to_port   = each.value.to_port
  protocol  = each.value.protocol

  cidr_blocks = try(each.value.cidr_blocks, null)

  # Only the first listed source group is used, matching the rule definitions
  # in local.security_groups (one source group per rule). The conditional is
  # wrapped in parentheses so it may span several lines.
  source_security_group_id = (
    try(each.value.security_groups, null) != null
    ? aws_security_group.this[each.value.security_groups[0]].id
    : null
  )
}

12. 결론

Infrastructure as Code는 현대적인 클라우드 인프라 관리의 핵심입니다. 2026년 현재, 다음과 같은 트렌드가 주목받고 있습니다:

주요 발전 방향

  1. 선언적 구성의 진화

    • GitOps 워크플로우 통합
    • 정책 기반 관리 (OPA, Sentinel)
    • 지속적 드리프트 감지
  2. 멀티 클라우드 전략

    • 클라우드 중립적 추상화
    • 하이브리드 인프라 관리
    • 비용 최적화 자동화
  3. 보안 강화

    • 시크릿 제로 접근 방식
    • 런타임 보안 정책
    • 컴플라이언스 자동화
  4. 관찰가능성 확대

    • 인프라 메트릭 통합
    • 예측적 장애 감지
    • 비용 투명성 향상

구현 권장사항

# 프로젝트 체크리스트
infrastructure_checklist:
  planning:
    - [ ] 도구 선택 및 평가
    - [ ] 모듈 설계 및 구조화
    - [ ] 환경별 전략 수립

  implementation:
    - [ ] 코드 품질 표준 정의
    - [ ] CI/CD 파이프라인 구축
    - [ ] 테스트 전략 수립

  operations:
    - [ ] 모니터링 및 알림 설정
    - [ ] 백업 및 복구 계획
    - [ ] 드리프트 감지 자동화

  governance:
    - [ ] 보안 정책 적용
    - [ ] 비용 관리 체계
    - [ ] 컴플라이언스 검증

이 가이드를 통해 현대적이고 확장 가능한 Infrastructure as Code 환경을 구축하고, 지속적으로 발전하는 클라우드 생태계에 효과적으로 대응할 수 있을 것입니다.

참고 자료

궁금한 점이 있으신가요?

문의사항이 있으시면 언제든지 연락주세요.