Loading...
Loading...
Infrastructure-as-Code patterns for data engineering using Terraform to provision AWS resources (S3, EC2, IAM)
npx skill4agent add aradotso/data-skills terraform-data-engineering-infrastructure

Skill by ara.so — Data Skills collection.
# Install Terraform (macOS)
brew tap hashicorp/tap
brew install hashicorp/tap/terraform
# Install AWS CLI (macOS)
brew install awscli
# Configure AWS CLI
aws configure
# Enter your AWS Access Key ID, Secret Access Key, region, and output format

# Create access keys for your IAM user
aws iam create-access-key --user-name your-terraform-user
# Configure AWS CLI with these credentials
aws configure --profile terraform

terraform/
├── main.tf # Main infrastructure definitions
├── variables.tf # Input variables (if present)
├── outputs.tf # Output values (if present)
└── terraform.tfstate # State file (generated)

# Initialize the working directory
terraform -chdir=terraform init
# Validate configuration files
terraform -chdir=terraform validate
# Format configuration files
terraform -chdir=terraform fmt

# Preview changes without applying
terraform -chdir=terraform plan
# Apply changes and create infrastructure
terraform -chdir=terraform apply
# Apply without confirmation prompt
terraform -chdir=terraform apply -auto-approve

# List all resources in state
terraform -chdir=terraform state list
# Show details of a specific resource
terraform -chdir=terraform state show aws_s3_bucket.data_bucket
# Output current state
terraform -chdir=terraform show

# Destroy all managed infrastructure
terraform -chdir=terraform destroy
# Destroy specific resources
terraform -chdir=terraform destroy -target=aws_instance.data_processor

# terraform/main.tf
# Constrain the AWS provider to the 5.x release series.
terraform {
  required_providers {
    aws = {
      version = "~> 5.0"
      source  = "hashicorp/aws"
    }
  }
}

# Resources managed through this provider are created in us-east-1.
provider "aws" {
  region = "us-east-1"
}
# Raw-data bucket. Change the name before applying: it must be globally unique.
resource "aws_s3_bucket" "raw_data" {
  bucket = "my-unique-raw-data-bucket-12345"

  tags = {
    ManagedBy   = "terraform"
    Purpose     = "data-lake-raw"
    Environment = "dev"
  }
}
# Turn on object versioning for the raw-data bucket (data recovery)
resource "aws_s3_bucket_versioning" "raw_data_versioning" {
  bucket = aws_s3_bucket.raw_data.id

  versioning_configuration {
    status = "Enabled"
  }
}
# Enable all four public-access block settings on the raw-data bucket
resource "aws_s3_bucket_public_access_block" "raw_data_public_access" {
  bucket = aws_s3_bucket.raw_data.id

  # ACL-related settings
  block_public_acls  = true
  ignore_public_acls = true

  # Bucket-policy-related settings
  block_public_policy     = true
  restrict_public_buckets = true
}

# Security group for EC2 instance
resource "aws_security_group" "data_processor_sg" {
  name        = "data-processor-sg"
  description = "Security group for data processing EC2 instances"

  # Inbound: TCP port 22 only.
  ingress {
    description = "SSH access"
    protocol    = "tcp"
    from_port   = 22
    to_port     = 22
    cidr_blocks = ["0.0.0.0/0"] # Open to every IPv4 address; restrict this in production
  }

  # Outbound: all protocols and ports to any IPv4 destination.
  egress {
    protocol    = "-1"
    from_port   = 0
    to_port     = 0
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = {
    ManagedBy = "terraform"
    Name      = "data-processor-sg"
  }
}
# Look up the most recent Amazon-owned Amazon Linux 2 image in the provider's
# region. AMI IDs are region-specific and are retired over time, so a
# hard-coded ID only works in the region (and period) it was copied from.
# NOTE: because most_recent = true, a newly published image changes this ID
# and a later apply will plan to replace the instance.
data "aws_ami" "amazon_linux_2" {
  most_recent = true
  owners      = ["amazon"]

  filter {
    name   = "name"
    values = ["amzn2-ami-hvm-*-x86_64-gp2"]
  }
}

# EC2 instance for data processing
resource "aws_instance" "data_processor" {
  ami                    = data.aws_ami.amazon_linux_2.id
  instance_type          = "t3.medium"
  vpc_security_group_ids = [aws_security_group.data_processor_sg.id]

  # Attaches the instance profile (and through it the IAM role) to the instance.
  iam_instance_profile = aws_iam_instance_profile.data_processor_profile.name

  # Boot script: update packages, then install Python 3 with boto3 and pandas.
  user_data = <<-EOF
    #!/bin/bash
    yum update -y
    yum install -y python3 python3-pip
    pip3 install boto3 pandas
  EOF

  tags = {
    Name        = "data-processor"
    Environment = "dev"
    ManagedBy   = "terraform"
  }
}

# IAM role for EC2 instances
resource "aws_iam_role" "data_processor_role" {
  name = "data-processor-role"

  # Trust policy: allows the EC2 service principal to assume this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect    = "Allow"
        Principal = { Service = "ec2.amazonaws.com" }
        Action    = "sts:AssumeRole"
      }
    ]
  })

  tags = {
    ManagedBy = "terraform"
  }
}
# Policy to allow S3 access to the raw-data bucket.
# The permissions are split into two statements so each action is paired with
# the resource type it applies to: ListBucket with the bucket ARN,
# GetObject/PutObject with the object ARNs ("<bucket-arn>/*").
resource "aws_iam_role_policy" "s3_access_policy" {
  name = "s3-access-policy"
  role = aws_iam_role.data_processor_role.id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid      = "ListRawBucket"
        Effect   = "Allow"
        Action   = ["s3:ListBucket"]
        Resource = [aws_s3_bucket.raw_data.arn]
      },
      {
        Sid    = "ReadWriteRawObjects"
        Effect = "Allow"
        Action = [
          "s3:GetObject",
          "s3:PutObject"
        ]
        Resource = ["${aws_s3_bucket.raw_data.arn}/*"]
      }
    ]
  })
}
# Instance profile wrapping data_processor_role; the EC2 instance references
# it through its iam_instance_profile argument.
resource "aws_iam_instance_profile" "data_processor_profile" {
  role = aws_iam_role.data_processor_role.name
  name = "data-processor-profile"
}

# terraform/variables.tf
# Deployment environment. The value is interpolated into S3 bucket names, so
# it is limited to characters that are legal there.
variable "environment" {
  description = "Environment name (dev, staging, prod)"
  type        = string
  default     = "dev"

  validation {
    condition     = can(regex("^[a-z0-9-]+$", var.environment))
    error_message = "The environment value may contain only lowercase letters, digits, and hyphens because it is used in S3 bucket names."
  }
}

# Required (no default). Forms the start of every bucket name built from it.
variable "bucket_prefix" {
  description = "Prefix for S3 bucket names"
  type        = string

  validation {
    # Rejects only prefixes that S3 would refuse anyway (uppercase letters,
    # underscores, a leading dot or hyphen, an empty string), so the failure
    # surfaces at plan time instead of midway through apply.
    condition     = can(regex("^[a-z0-9][a-z0-9.-]*$", var.bucket_prefix))
    error_message = "The bucket_prefix value must start with a lowercase letter or digit and contain only lowercase letters, digits, dots, and hyphens."
  }
}

variable "instance_type" {
  description = "EC2 instance type"
  type        = string
  default     = "t3.medium"
}

variable "aws_region" {
  description = "AWS region"
  type        = string
  default     = "us-east-1"
}
# terraform/main.tf
resource "aws_s3_bucket" "data_bucket" {
bucket = "${var.bucket_prefix}-${var.environment}-data"
tags = {
Environment = var.environment
ManagedBy = "terraform"
}
}

terraform -chdir=terraform apply \
-var="environment=prod" \
-var="bucket_prefix=mycompany" \
-var="instance_type=t3.large"

# terraform/outputs.tf
# --- Raw-data bucket ---
output "s3_bucket_name" {
  value       = aws_s3_bucket.raw_data.id
  description = "Name of the S3 bucket"
}

output "s3_bucket_arn" {
  value       = aws_s3_bucket.raw_data.arn
  description = "ARN of the S3 bucket"
}

# --- Data-processing instance ---
output "ec2_instance_id" {
  value       = aws_instance.data_processor.id
  description = "ID of the EC2 instance"
}

output "ec2_public_ip" {
  value       = aws_instance.data_processor.public_ip
  description = "Public IP of the EC2 instance"
}
# View outputs
# terraform -chdir=terraform output

# Clone the repository
git clone https://github.com/josephmachado/iac-for-data-engineering-terraform-.git
cd iac-for-data-engineering-terraform-
# Update bucket name in terraform/main.tf to be globally unique
# Edit terraform/main.tf and change bucket name
# Initialize and apply
terraform -chdir=terraform init
terraform -chdir=terraform validate
terraform -chdir=terraform fmt
terraform -chdir=terraform apply

# List S3 buckets
aws s3 ls
# Check EC2 instances
aws ec2 describe-instances \
--filters "Name=instance-state-name,Values=running" \
--query 'Reservations[].Instances[].{ID:InstanceId, Name:Tags[?Key==`Name`].Value, Type:InstanceType, State:State.Name, PublicIP:PublicIpAddress}' \
--output table
# View Terraform state
terraform -chdir=terraform state list
cat terraform/terraform.tfstate | jq -r '.resources[] | [.type, .name] | join(",")'

# Edit terraform files
# Then preview changes
terraform -chdir=terraform plan
# Apply changes
terraform -chdir=terraform apply

# Destroy all resources
terraform -chdir=terraform destroy
# Verify cleanup
aws s3 ls
aws ec2 describe-instances --filters "Name=instance-state-name,Values=running"

# Raw data bucket
resource "aws_s3_bucket" "raw" {
  # Name pattern: <bucket_prefix>-raw-<environment>
  bucket = "${var.bucket_prefix}-raw-${var.environment}"

  tags = { Layer = "raw" }
}
# Bucket for the "processed" layer
resource "aws_s3_bucket" "processed" {
  # Name pattern: <bucket_prefix>-processed-<environment>
  bucket = "${var.bucket_prefix}-processed-${var.environment}"

  tags = { Layer = "processed" }
}
# Bucket for the "curated" layer
resource "aws_s3_bucket" "curated" {
  # Name pattern: <bucket_prefix>-curated-<environment>
  bucket = "${var.bucket_prefix}-curated-${var.environment}"

  tags = { Layer = "curated" }
}
# Lifecycle policy for raw data: move objects to Glacier after 90 days and
# expire them after 365 days.
resource "aws_s3_bucket_lifecycle_configuration" "raw_lifecycle" {
  bucket = aws_s3_bucket.raw.id

  rule {
    id     = "archive-old-data"
    status = "Enabled"

    # An empty filter applies the rule to every object in the bucket. It is
    # stated explicitly because recent AWS provider 5.x releases warn when a
    # rule specifies neither `filter` nor `prefix`.
    filter {}

    transition {
      days          = 90
      storage_class = "GLACIER"
    }

    expiration {
      days = 365
    }
  }
}

# Create S3 bucket for state
resource "aws_s3_bucket" "terraform_state" {
  bucket = "my-terraform-state-bucket-12345"

  # Terraform rejects any plan that would destroy this bucket.
  lifecycle {
    prevent_destroy = true
  }
}
# Turn on object versioning for the state bucket
resource "aws_s3_bucket_versioning" "terraform_state_versioning" {
  bucket = aws_s3_bucket.terraform_state.id

  versioning_configuration {
    status = "Enabled"
  }
}
# Configure backend (in a separate backend.tf file)
# terraform {
# backend "s3" {
# bucket = "my-terraform-state-bucket-12345"
# key = "data-engineering/terraform.tfstate"
# region = "us-east-1"
# }
# }

BucketAlreadyExists: The requested bucket name is not available

In main.tf:

resource "aws_s3_bucket" "data_bucket" {
bucket = "your-unique-prefix-data-bucket-${random_id.bucket_suffix.hex}"
}
resource "random_id" "bucket_suffix" {
byte_length = 4
}

UnauthorizedOperation / AccessDenied

# Check current user identity
aws sts get-caller-identity
# Verify policies attached to user
aws iam list-attached-user-policies --user-name your-terraform-user

Error acquiring the state lock

# Force unlock (use with caution)
terraform -chdir=terraform force-unlock LOCK_ID
# Or remove local state lock file
rm terraform/.terraform.tfstate.lock.info

# Import S3 bucket
terraform -chdir=terraform import aws_s3_bucket.data_bucket my-existing-bucket-name
# Import EC2 instance
terraform -chdir=terraform import aws_instance.data_processor i-1234567890abcdef0

# Refresh state to match real infrastructure
terraform -chdir=terraform refresh
# Or during plan/apply
terraform -chdir=terraform apply -refresh=true

data "aws_ami" "amazon_linux_2" {
most_recent = true
owners = ["amazon"]
filter {
name = "name"
values = ["amzn2-ami-hvm-*-x86_64-gp2"]
}
}
resource "aws_instance" "data_processor" {
ami = data.aws_ami.amazon_linux_2.id
instance_type = "t3.medium"
}

.tf · terraform.tfstate · .terraform/ · terraform plan · apply

# AWS credentials (preferred over hardcoding)
export AWS_ACCESS_KEY_ID=your_access_key
export AWS_SECRET_ACCESS_KEY=your_secret_key
export AWS_DEFAULT_REGION=us-east-1
# Terraform variables
export TF_VAR_environment=dev
export TF_VAR_bucket_prefix=mycompany
export TF_VAR_instance_type=t3.medium