Aurora Serverless v2 と Redshift Serverless の Zero-ETL統合 でコスパ最強のデータ分析基盤

データ分析基盤を作りたいと思ったことは誰しもあるかと思います。

AWSではOLAPワークロードにAurora、OLAPワークロードにRedshiftが提供されていましたが、どちらも最低利用料が1万程度になっていました。
しかしながら、近年ではServerless版が発表され負荷の軽いワークロードでは、低価格を実現することが可能となりました。

今回は、Terraformで簡単にAurora Serverless v2 と Redshift Serverless の Zero-ETL統合を行いたいと思います。

Terraformコード
data "aws_caller_identity" "this" {}

resource "random_id" "this" {
  byte_length = 4
}

#
# Aurora Serverless
# 

# Aurora Serverless - PostgreSQL

resource "aws_rds_cluster" "this" {
  cluster_identifier              = "${var.service}-${var.environment}-cluster"
  engine                          = "aurora-postgresql"
  engine_mode                     = "provisioned"
  engine_version                  = "16.6"
  database_name                   = "${var.service}_${var.environment}_db"
  master_username                 = "${var.service}_${var.environment}_admin"
  master_password                 = var.rds_password
  storage_encrypted               = true
  enabled_cloudwatch_logs_exports = ["postgresql"]
  backup_retention_period         = 7
  final_snapshot_identifier       = "${var.service}-${var.environment}-cluster-${random_id.this.hex}"
  db_cluster_parameter_group_name = aws_rds_cluster_parameter_group.this.name
  db_subnet_group_name            = aws_db_subnet_group.rds.name
  vpc_security_group_ids          = [aws_security_group.rds.id]
  network_type                    = "DUAL"

  serverlessv2_scaling_configuration {
    max_capacity = 1.0
    min_capacity = 0
  }

  lifecycle {
    ignore_changes = [
      engine_version
    ]
  }
}

resource "aws_rds_cluster_instance" "this" {
  identifier                   = "${var.service}-${var.environment}-instance"
  cluster_identifier           = aws_rds_cluster.this.id
  instance_class               = "db.serverless"
  engine                       = aws_rds_cluster.this.engine
  engine_version               = aws_rds_cluster.this.engine_version
  db_subnet_group_name         = aws_db_subnet_group.rds.name
  db_parameter_group_name      = aws_db_parameter_group.this.name
  performance_insights_enabled = var.environment == "prod" ? true : false
}

# Aurora Serverless - Security group

resource "aws_security_group" "rds" {
  name   = "${var.service}-${var.environment}-rds-sg"
  vpc_id = var.vpc.id
}

# Aurora Serverless - Parameter group

resource "aws_rds_cluster_parameter_group" "this" {
  name   = "${var.service}-${var.environment}-cluster-parameter"
  family = "aurora-postgresql16"

  parameter {
    apply_method = "pending-reboot"
    name         = "rds.logical_replication"
    value        = 1
  }
  parameter {
    apply_method = "pending-reboot"
    name         = "aurora.enhanced_logical_replication"
    value        = 1
  }
  parameter {
    apply_method = "pending-reboot"
    name         = "aurora.logical_replication_backup"
    value        = 0
  }
  parameter {
    apply_method = "pending-reboot"
    name         = "aurora.logical_replication_globaldb"
    value        = 0
  }
}

resource "aws_db_parameter_group" "this" {
  name   = "${var.service}-${var.environment}-instance-parameter"
  family = "aurora-postgresql16"
}

# Aurora Serverless - Subnet group

resource "aws_db_subnet_group" "rds" {
  name        = "${var.service}-${var.environment}-rds-sg"
  description = "${var.service}-${var.environment}-rds-sg"
  subnet_ids  = var.database_subnet_ids
}

#
# Redshift Serverless
#

# RedShift Serverless - Namesapce

resource "aws_redshiftserverless_namespace" "this" {
  namespace_name = "${var.service}-${var.environment}-redshift-serverless-namespace"
  db_name        = "${var.service}-${var.environment}-db"
}

# RedShift Serverless - Workgroup

resource "aws_redshiftserverless_workgroup" "this" {
  namespace_name       = aws_redshiftserverless_namespace.this.namespace_name
  workgroup_name       = "${var.service}-${var.environment}-redshift-serverless-workgroup"
  subnet_ids           = aws_db_subnet_group.redshift.subnet_ids
  security_group_ids   = [aws_security_group.redshift.id]
  base_capacity        = 8
  max_capacity         = 256
  enhanced_vpc_routing = true
  publicly_accessible  = false
  port                 = 5439


  config_parameter {
    parameter_key   = "auto_mv"
    parameter_value = "true"
  }
  config_parameter {
    parameter_key   = "datestyle"
    parameter_value = "ISO, MDY"
  }
  config_parameter {
    parameter_key   = "enable_case_sensitive_identifier"
    parameter_value = "true"
  }
  config_parameter {
    parameter_key   = "enable_user_activity_logging"
    parameter_value = "true"
  }
  config_parameter {
    parameter_key   = "max_query_execution_time"
    parameter_value = "14400"
  }
  config_parameter {
    parameter_key   = "query_group"
    parameter_value = "default"
  }
  config_parameter {
    parameter_key   = "require_ssl"
    parameter_value = "true"
  }
  config_parameter {
    parameter_key   = "search_path"
    parameter_value = "$user, public"
  }
  config_parameter {
    parameter_key   = "use_fips_ssl"
    parameter_value = "false"
  }
}

data "aws_iam_policy_document" "this" {
  statement {
    effect = "Allow"
    principals {
      type        = "Service"
      identifiers = ["redshift.amazonaws.com"]
    }
    actions = [
      "redshift:AuthorizeInboundIntegration"
    ]
    condition {
      test     = "StringEquals"
      variable = "aws:SourceArn"
      values   = [aws_rds_cluster.this.arn]
    }
  }
  statement {
    effect = "Allow"
    principals {
      type        = "AWS"
      identifiers = ["arn:aws:iam::${data.aws_caller_identity.this.account_id}:root"]
    }
    actions = [
      "redshift:CreateInboundIntegration",
    ]
  }
}

resource "aws_redshift_resource_policy" "this" {
  resource_arn = aws_redshiftserverless_namespace.this.arn
  policy       = data.aws_iam_policy_document.this.json
}

# RedShift Serverless - Security group

resource "aws_security_group" "redshift" {
  name   = "${var.service}-${var.environment}-redshift-sg"
  vpc_id = var.vpc.id
}

# RedShift Serverless - Subnet group

resource "aws_db_subnet_group" "redshift" {
  name       = "${var.service}-${var.environment}-redshift-subnet-group"
  subnet_ids = var.database_subnet_ids
}

#
# Zero-ETL intergration
#

resource "aws_rds_integration" "this" {
  integration_name = "${var.service}-${var.environment}-zero-etl-intergration"
  source_arn       = aws_rds_cluster.this.arn
  target_arn       = aws_redshiftserverless_namespace.this.arn
  data_filter      = "include: ${var.service}_${var.environment}_db.*.*"

  lifecycle {
    ignore_changes = [
      kms_key_id
    ]
  }
}
コード解説

重要な部分のみ解説します。

Aurora parameter

以下の4つのパラメータが必須です。作成済みのAuroraに対して適用する場合は、手動で再起動が必要です。

# Aurora Serverless - Parameter group

resource "aws_rds_cluster_parameter_group" "this" {
  name   = "${var.service}-${var.environment}-cluster-parameter"
  family = "aurora-postgresql16"

  parameter {
    apply_method = "pending-reboot"
    name         = "rds.logical_replication"
    value        = 1
  }
  parameter {
    apply_method = "pending-reboot"
    name         = "aurora.enhanced_logical_replication"
    value        = 1
  }
  parameter {
    apply_method = "pending-reboot"
    name         = "aurora.logical_replication_backup"
    value        = 0
  }
  parameter {
    apply_method = "pending-reboot"
    name         = "aurora.logical_replication_globaldb"
    value        = 0
  }
}
Redshift config parameter

Redshift へのパラメータ変更点です。
enable_case_sensitive_identifier = “true”がZero-ETL統合では必須です。
他のパラメータはデフォルトでも問題ないですが、terraform-aws-providerのバグ(https://github.com/pulumi/pulumi-aws/issues/4405)でデフォルトのパラメータでも明示的に指定する必要があります。指定しない場合は変更がないのにも関わらず変更を検知して、エラーとなる場合があります。

  <em># RedShift Serverless - Workgroup</em>
)

 config_parameter {
    parameter_key   = "auto_mv"
    parameter_value = "true"
  }
  config_parameter {
    parameter_key   = "datestyle"
    parameter_value = "ISO, MDY"
  }
  config_parameter {
    parameter_key   = "enable_case_sensitive_identifier"
    parameter_value = "true" <em>#trueが必須</em>
  }
  config_parameter {
    parameter_key   = "enable_user_activity_logging"
    parameter_value = "true"
  }
  config_parameter {
    parameter_key   = "max_query_execution_time"
    parameter_value = "14400"
  }
  config_parameter {
    parameter_key   = "query_group"
    parameter_value = "default"
  }
  config_parameter {
    parameter_key   = "require_ssl"
    parameter_value = "true"
  }
  config_parameter {
    parameter_key   = "search_path"
    parameter_value = "$user, public"
  }
  config_parameter {
    parameter_key   = "use_fips_ssl"
    parameter_value = "false"
  }
Redshift resource policy 

AuroraにRedshift を読み込む権限と、AWSアカウント内のユーザーにETL統合を操作する権限を付与する必要があります。
リソースポリシーは、aws_redshift_serverless_resource_policyではなく、aws_redshift_resource_policyを利用することに注意してください。

data "aws_iam_policy_document" "this" {
  statement {
    effect = "Allow"
    principals {
      type        = "Service"
      identifiers = ["redshift.amazonaws.com"]
    }
    actions = [
      "redshift:AuthorizeInboundIntegration"
    ]
    condition {
      test     = "StringEquals"
      variable = "aws:SourceArn"
      values   = [aws_rds_cluster.this.arn]
    }
  }
  statement {
    effect = "Allow"
    principals {
      type        = "AWS"
      identifiers = ["arn:aws:iam::${data.aws_caller_identity.this.account_id}:root"]
    }
    actions = [
      "redshift:CreateInboundIntegration",
    ]
  }
}

resource "aws_redshift_resource_policy" "this" {
  resource_arn = aws_redshiftserverless_namespace.this.arn
  policy       = data.aws_iam_policy_document.this.json
}
ETL統合

以上の設定が済んだら、ETL統合を行います。お疲れ様でした。

#
# Zero-ETL intergration
#

resource "aws_rds_integration" "this" {
  integration_name = "${var.service}-${var.environment}-zero-etl-intergration"
  source_arn       = aws_rds_cluster.this.arn
  target_arn       = aws_redshiftserverless_namespace.this.arn
  data_filter      = "include: ${var.service}_${var.environment}_db.*.*"

  lifecycle {
    ignore_changes = [
      kms_key_id
    ]
  }
}