データ分析基盤を作りたいと思ったことは誰しもあるかと思います。
AWSではOLAPワークロードにAurora、OLAPワークロードにRedshiftが提供されていましたが、どちらも最低利用料が1万程度になっていました。
しかしながら、近年ではServerless版が発表され負荷の軽いワークロードでは、低価格を実現することが可能となりました。
今回は、Terraformで簡単にAurora Serverless v2 と Redshift Serverless の Zero-ETL統合を行いたいと思います。
Terraformコード
data "aws_caller_identity" "this" {}
resource "random_id" "this" {
byte_length = 4
}
#
# Aurora Serverless
#
# Aurora Serverless - PostgreSQL
resource "aws_rds_cluster" "this" {
cluster_identifier = "${var.service}-${var.environment}-cluster"
engine = "aurora-postgresql"
engine_mode = "provisioned"
engine_version = "16.6"
database_name = "${var.service}_${var.environment}_db"
master_username = "${var.service}_${var.environment}_admin"
master_password = var.rds_password
storage_encrypted = true
enabled_cloudwatch_logs_exports = ["postgresql"]
backup_retention_period = 7
final_snapshot_identifier = "${var.service}-${var.environment}-cluster-${random_id.this.hex}"
db_cluster_parameter_group_name = aws_rds_cluster_parameter_group.this.name
db_subnet_group_name = aws_db_subnet_group.rds.name
vpc_security_group_ids = [aws_security_group.rds.id]
network_type = "DUAL"
serverlessv2_scaling_configuration {
max_capacity = 1.0
min_capacity = 0
}
lifecycle {
ignore_changes = [
engine_version
]
}
}
resource "aws_rds_cluster_instance" "this" {
identifier = "${var.service}-${var.environment}-instance"
cluster_identifier = aws_rds_cluster.this.id
instance_class = "db.serverless"
engine = aws_rds_cluster.this.engine
engine_version = aws_rds_cluster.this.engine_version
db_subnet_group_name = aws_db_subnet_group.rds.name
db_parameter_group_name = aws_db_parameter_group.this.name
performance_insights_enabled = var.environment == "prod" ? true : false
}
# Aurora Serverless - Security group
resource "aws_security_group" "rds" {
name = "${var.service}-${var.environment}-rds-sg"
vpc_id = var.vpc.id
}
# Aurora Serverless - Parameter group
resource "aws_rds_cluster_parameter_group" "this" {
name = "${var.service}-${var.environment}-cluster-parameter"
family = "aurora-postgresql16"
parameter {
apply_method = "pending-reboot"
name = "rds.logical_replication"
value = 1
}
parameter {
apply_method = "pending-reboot"
name = "aurora.enhanced_logical_replication"
value = 1
}
parameter {
apply_method = "pending-reboot"
name = "aurora.logical_replication_backup"
value = 0
}
parameter {
apply_method = "pending-reboot"
name = "aurora.logical_replication_globaldb"
value = 0
}
}
resource "aws_db_parameter_group" "this" {
name = "${var.service}-${var.environment}-instance-parameter"
family = "aurora-postgresql16"
}
# Aurora Serverless - Subnet group
resource "aws_db_subnet_group" "rds" {
name = "${var.service}-${var.environment}-rds-sg"
description = "${var.service}-${var.environment}-rds-sg"
subnet_ids = var.database_subnet_ids
}
#
# Redshift Serverless
#
# RedShift Serverless - Namesapce
resource "aws_redshiftserverless_namespace" "this" {
namespace_name = "${var.service}-${var.environment}-redshift-serverless-namespace"
db_name = "${var.service}-${var.environment}-db"
}
# RedShift Serverless - Workgroup
resource "aws_redshiftserverless_workgroup" "this" {
namespace_name = aws_redshiftserverless_namespace.this.namespace_name
workgroup_name = "${var.service}-${var.environment}-redshift-serverless-workgroup"
subnet_ids = aws_db_subnet_group.redshift.subnet_ids
security_group_ids = [aws_security_group.redshift.id]
base_capacity = 8
max_capacity = 256
enhanced_vpc_routing = true
publicly_accessible = false
port = 5439
config_parameter {
parameter_key = "auto_mv"
parameter_value = "true"
}
config_parameter {
parameter_key = "datestyle"
parameter_value = "ISO, MDY"
}
config_parameter {
parameter_key = "enable_case_sensitive_identifier"
parameter_value = "true"
}
config_parameter {
parameter_key = "enable_user_activity_logging"
parameter_value = "true"
}
config_parameter {
parameter_key = "max_query_execution_time"
parameter_value = "14400"
}
config_parameter {
parameter_key = "query_group"
parameter_value = "default"
}
config_parameter {
parameter_key = "require_ssl"
parameter_value = "true"
}
config_parameter {
parameter_key = "search_path"
parameter_value = "$user, public"
}
config_parameter {
parameter_key = "use_fips_ssl"
parameter_value = "false"
}
}
data "aws_iam_policy_document" "this" {
statement {
effect = "Allow"
principals {
type = "Service"
identifiers = ["redshift.amazonaws.com"]
}
actions = [
"redshift:AuthorizeInboundIntegration"
]
condition {
test = "StringEquals"
variable = "aws:SourceArn"
values = [aws_rds_cluster.this.arn]
}
}
statement {
effect = "Allow"
principals {
type = "AWS"
identifiers = ["arn:aws:iam::${data.aws_caller_identity.this.account_id}:root"]
}
actions = [
"redshift:CreateInboundIntegration",
]
}
}
resource "aws_redshift_resource_policy" "this" {
resource_arn = aws_redshiftserverless_namespace.this.arn
policy = data.aws_iam_policy_document.this.json
}
# RedShift Serverless - Security group
resource "aws_security_group" "redshift" {
name = "${var.service}-${var.environment}-redshift-sg"
vpc_id = var.vpc.id
}
# RedShift Serverless - Subnet group
resource "aws_db_subnet_group" "redshift" {
name = "${var.service}-${var.environment}-redshift-subnet-group"
subnet_ids = var.database_subnet_ids
}
#
# Zero-ETL intergration
#
resource "aws_rds_integration" "this" {
integration_name = "${var.service}-${var.environment}-zero-etl-intergration"
source_arn = aws_rds_cluster.this.arn
target_arn = aws_redshiftserverless_namespace.this.arn
data_filter = "include: ${var.service}_${var.environment}_db.*.*"
lifecycle {
ignore_changes = [
kms_key_id
]
}
}
コード解説
重要な部分のみ解説します。
Aurora parameter
以下の4つのパラメータが必須です。作成済みのAuroraに対して適用する場合は、手動で再起動が必要です。
# Aurora Serverless - Parameter group
resource "aws_rds_cluster_parameter_group" "this" {
name = "${var.service}-${var.environment}-cluster-parameter"
family = "aurora-postgresql16"
parameter {
apply_method = "pending-reboot"
name = "rds.logical_replication"
value = 1
}
parameter {
apply_method = "pending-reboot"
name = "aurora.enhanced_logical_replication"
value = 1
}
parameter {
apply_method = "pending-reboot"
name = "aurora.logical_replication_backup"
value = 0
}
parameter {
apply_method = "pending-reboot"
name = "aurora.logical_replication_globaldb"
value = 0
}
}
Redshift config parameter
Redshift へのパラメータ変更点です。
enable_case_sensitive_identifier = “true”がZero-ETL統合では必須です。
他のパラメータはデフォルトでも問題ないですが、terraform-aws-providerのバグ(https://github.com/pulumi/pulumi-aws/issues/4405)でデフォルトのパラメータでも明示的に指定する必要があります。指定しない場合は変更がないのにも関わらず変更を検知して、エラーとなる場合があります。
<em># RedShift Serverless - Workgroup</em>
(略)
config_parameter {
parameter_key = "auto_mv"
parameter_value = "true"
}
config_parameter {
parameter_key = "datestyle"
parameter_value = "ISO, MDY"
}
config_parameter {
parameter_key = "enable_case_sensitive_identifier"
parameter_value = "true" <em>#trueが必須</em>
}
config_parameter {
parameter_key = "enable_user_activity_logging"
parameter_value = "true"
}
config_parameter {
parameter_key = "max_query_execution_time"
parameter_value = "14400"
}
config_parameter {
parameter_key = "query_group"
parameter_value = "default"
}
config_parameter {
parameter_key = "require_ssl"
parameter_value = "true"
}
config_parameter {
parameter_key = "search_path"
parameter_value = "$user, public"
}
config_parameter {
parameter_key = "use_fips_ssl"
parameter_value = "false"
}
Redshift resource policy
AuroraにRedshift を読み込む権限と、AWSアカウント内のユーザーにETL統合を操作する権限を付与する必要があります。
リソースポリシーは、aws_redshift_serverless_resource_policyではなく、aws_redshift_resource_policyを利用することに注意してください。
data "aws_iam_policy_document" "this" {
statement {
effect = "Allow"
principals {
type = "Service"
identifiers = ["redshift.amazonaws.com"]
}
actions = [
"redshift:AuthorizeInboundIntegration"
]
condition {
test = "StringEquals"
variable = "aws:SourceArn"
values = [aws_rds_cluster.this.arn]
}
}
statement {
effect = "Allow"
principals {
type = "AWS"
identifiers = ["arn:aws:iam::${data.aws_caller_identity.this.account_id}:root"]
}
actions = [
"redshift:CreateInboundIntegration",
]
}
}
resource "aws_redshift_resource_policy" "this" {
resource_arn = aws_redshiftserverless_namespace.this.arn
policy = data.aws_iam_policy_document.this.json
}
ETL統合
以上の設定が済んだら、ETL統合を行います。お疲れ様でした。
#
# Zero-ETL intergration
#
resource "aws_rds_integration" "this" {
integration_name = "${var.service}-${var.environment}-zero-etl-intergration"
source_arn = aws_rds_cluster.this.arn
target_arn = aws_redshiftserverless_namespace.this.arn
data_filter = "include: ${var.service}_${var.environment}_db.*.*"
lifecycle {
ignore_changes = [
kms_key_id
]
}
}