Loading...
Loading...
Infrastructure-as-Code fundamentals for data engineering using Terraform to provision AWS resources (S3, EC2, IAM)
npx skill4agent add aradotso/data-skills terraform-data-engineering-iac
Skill by ara.so — Data Skills collection.
# --- Install Terraform ---
# macOS
brew install terraform
# Linux: download a pinned release, unpack it, and put the binary on PATH.
# Change TERRAFORM_VERSION to install a different release.
TERRAFORM_VERSION="1.5.0"
wget "https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/terraform_${TERRAFORM_VERSION}_linux_amd64.zip"
unzip "terraform_${TERRAFORM_VERSION}_linux_amd64.zip"
sudo mv terraform /usr/local/bin/
# Verify installation
terraform version

# --- Install the AWS CLI ---
# macOS
brew install awscli
# Linux
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip awscliv2.zip
sudo ./aws/install
# Configure AWS credentials
aws configure

AmazonS3FullAccess
AmazonEC2FullAccess
IAMFullAccess

terraform/
├── main.tf # Main infrastructure definitions
├── variables.tf # Input variables
├── outputs.tf # Output values
└── terraform.tfstate # State file (generated)

# Initialize backend and download providers
terraform -chdir=terraform init

# Check syntax and validate configuration
terraform -chdir=terraform validate

# Auto-format HCL files
terraform -chdir=terraform fmt

# Preview what will be created/changed
terraform -chdir=terraform plan

# Create or update infrastructure
terraform -chdir=terraform apply
# Auto-approve without confirmation (use carefully)
terraform -chdir=terraform apply -auto-approve

# Remove all managed infrastructure
terraform -chdir=terraform destroy
# Auto-approve destruction (use carefully)
terraform -chdir=terraform destroy -auto-approve

# List all resources in state
terraform -chdir=terraform state list
# Show detailed resource information
# (the address must match a resource in the configuration; the bucket is
# declared as aws_s3_bucket.data_lake)
terraform -chdir=terraform state show aws_s3_bucket.data_lake
# Print one "type,name" line per resource recorded in state (requires jq).
# `state pull` is used instead of reading terraform/terraform.tfstate so the
# command also works when state lives in a remote backend.
terraform -chdir=terraform state pull | jq -r '.resources[] | [.type, .name] | join(",")'

# terraform/main.tf
# Provider requirements: use the official HashiCorp AWS provider and accept
# any 5.x release.
terraform {
  required_providers {
    aws = {
      version = "~> 5.0"
      source  = "hashicorp/aws"
    }
  }
}

# The AWS region is taken from the aws_region input variable.
provider "aws" {
  region = var.aws_region
}
# Optional extra suffix for the bucket name. S3 bucket names are globally
# unique, so a name collision can be resolved without editing this file, e.g.
#   terraform -chdir=terraform apply -var="bucket_suffix=$(date +%s)"
# Declared next to its only consumer; it may be moved to variables.tf.
variable "bucket_suffix" {
  description = "Optional suffix appended to the data lake bucket name"
  type        = string
  default     = ""
}

# S3 bucket that serves as the data lake.
resource "aws_s3_bucket" "data_lake" {
  # With the default (empty) suffix the name is unchanged:
  #   my-unique-data-lake-bucket-<environment>
  # With a suffix it becomes:
  #   my-unique-data-lake-bucket-<environment>-<bucket_suffix>
  bucket = "my-unique-data-lake-bucket-${var.environment}${var.bucket_suffix == "" ? "" : "-${var.bucket_suffix}"}"

  tags = {
    Name        = "Data Lake Bucket"
    Environment = var.environment
    Project     = "data-engineering"
  }
}
# Turn on object versioning for the data lake bucket.
resource "aws_s3_bucket_versioning" "data_lake_versioning" {
  bucket = aws_s3_bucket.data_lake.id

  versioning_configuration {
    status = "Enabled"
  }
}
# Lifecycle policy for the data lake bucket: move objects to Glacier after
# 90 days and expire them after 365 days.
resource "aws_s3_bucket_lifecycle_configuration" "data_lake_lifecycle" {
  bucket = aws_s3_bucket.data_lake.id

  # Apply lifecycle rules only after versioning has been configured on the
  # bucket, so the two bucket-level settings are not applied concurrently.
  depends_on = [aws_s3_bucket_versioning.data_lake_versioning]

  rule {
    id     = "archive_old_data"
    status = "Enabled"

    # Each rule needs a filter (or the legacy prefix); an empty filter makes
    # the rule apply to every object in the bucket.
    filter {}

    transition {
      days          = 90
      storage_class = "GLACIER"
    }

    # NOTE: this bucket is versioned, so expiring a current object version
    # adds a delete marker; noncurrent versions are kept unless a
    # noncurrent_version_expiration block is added as well.
    expiration {
      days = 365
    }
  }
}

# terraform/main.tf (continued)
# Look up the newest AMI owned by account 099720109477 (Canonical) whose name
# matches the Ubuntu 22.04 (jammy) amd64 server pattern.
data "aws_ami" "ubuntu" {
  owners      = ["099720109477"] # Canonical
  most_recent = true

  filter {
    name   = "name"
    values = ["ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*"]
  }
}
# EC2 instance used for data processing. It boots from the Ubuntu AMI found
# by data.aws_ami.ubuntu and is sized by the instance_type variable.
resource "aws_instance" "data_processor" {
  instance_type = var.instance_type
  ami           = data.aws_ami.ubuntu.id

  # Boot script: refresh apt metadata, install pip, then install the Python
  # packages. The <<- heredoc form strips the common leading indentation.
  user_data = <<-EOF
    #!/bin/bash
    sudo apt-get update
    sudo apt-get install -y python3-pip
    pip3 install pandas boto3 apache-airflow
  EOF

  tags = {
    Name        = "Data Processing Server"
    Environment = var.environment
  }
}
# Elastic IP attached to the data processing instance.
resource "aws_eip" "data_processor_eip" {
  domain   = "vpc"
  instance = aws_instance.data_processor.id
}

# terraform/variables.tf
# Region passed to the AWS provider.
variable "aws_region" {
  type        = string
  default     = "us-east-1"
  description = "AWS region for resources"
}

# Environment label; used in the bucket name and in resource tags.
variable "environment" {
  type        = string
  default     = "dev"
  description = "Environment name"
}

# Size of the data processing EC2 instance.
variable "instance_type" {
  type        = string
  default     = "t3.medium"
  description = "EC2 instance type"
}

# terraform/outputs.tf
# Values printed after apply and readable with `terraform output`.

output "s3_bucket_name" {
  value       = aws_s3_bucket.data_lake.id
  description = "Name of the S3 data lake bucket"
}

output "ec2_public_ip" {
  value       = aws_eip.data_processor_eip.public_ip
  description = "Public IP of data processing EC2 instance"
}

output "ec2_instance_id" {
  value       = aws_instance.data_processor.id
  description = "Instance ID of data processor"
}

# Use workspace or separate state files
terraform workspace new staging
terraform workspace new production
# Or use variable files
terraform apply -var-file="environments/dev.tfvars"
terraform apply -var-file="environments/prod.tfvars"

# terraform/backend.tf
# Remote state: store terraform.tfstate encrypted in S3 and use a DynamoDB
# table for state locking.
terraform {
  backend "s3" {
    bucket         = "my-terraform-state-bucket"
    key            = "data-engineering/terraform.tfstate"
    region         = "us-east-1"
    encrypt        = true
    dynamodb_table = "terraform-state-lock"
  }
}

# IAM role for the data processor. The trust policy lets the EC2 service
# (ec2.amazonaws.com) assume the role.
resource "aws_iam_role" "data_processor_role" {
  name = "data-processor-role"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Action = "sts:AssumeRole"
      Effect = "Allow"
      Principal = {
        Service = "ec2.amazonaws.com"
      }
    }]
  })
}
# Attach the AWS-managed AmazonS3FullAccess policy to the data processor role.
# NOTE(review): this is a broad grant; consider a policy scoped to the data
# lake bucket if the instance only needs that bucket.
resource "aws_iam_role_policy_attachment" "s3_access" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonS3FullAccess"
  role       = aws_iam_role.data_processor_role.name
}

# Instance profile that carries the role so it can be set on an EC2 instance.
resource "aws_iam_instance_profile" "data_processor_profile" {
  role = aws_iam_role.data_processor_role.name
  name = "data-processor-profile"
}
# The data processor instance with the instance profile attached.
# NOTE: this has the same address (aws_instance.data_processor) as the
# instance defined in main.tf. A configuration may declare that address only
# once, so presumably iam_instance_profile is meant to be added to the
# existing resource rather than pasting this as a second block — confirm.
resource "aws_instance" "data_processor" {
  instance_type        = var.instance_type
  ami                  = data.aws_ami.ubuntu.id
  iam_instance_profile = aws_iam_instance_profile.data_processor_profile.name
}

# List all S3 buckets
aws s3 ls
# Get bucket details
aws s3api get-bucket-location --bucket my-data-lake-bucket
# List bucket contents
aws s3 ls s3://my-data-lake-bucket/

# List running instances
# (`| [0]` collapses the Name tag match to one value so it renders as a
# normal table column instead of a nested list)
aws ec2 describe-instances \
  --filters "Name=instance-state-name,Values=running" \
  --query 'Reservations[].Instances[].{ID:InstanceId, Name:Tags[?Key==`Name`] | [0].Value, Type:InstanceType, State:State.Name, PublicIP:PublicIpAddress, PrivateIP:PrivateIpAddress}' \
  --output table
# Get specific instance details
aws ec2 describe-instances --instance-ids i-1234567890abcdef0

# SSH into instance (requires key pair)
ssh -i ~/.ssh/my-key.pem "ubuntu@$(terraform -chdir=terraform output -raw ec2_public_ip)"

# Clear cache and reinitialize
rm -rf terraform/.terraform
# -f: do not fail when the lock file has not been created yet
rm -f terraform/.terraform.lock.hcl
terraform -chdir=terraform init

# Force unlock (use with caution)
terraform -chdir=terraform force-unlock LOCK_ID

# Verify AWS configuration
aws configure list
aws sts get-caller-identity
# Set credentials explicitly
# (re-exports values already present in the shell; ${VAR:?} aborts with a
# message instead of silently exporting an empty credential)
export AWS_ACCESS_KEY_ID="${AWS_ACCESS_KEY_ID:?set AWS_ACCESS_KEY_ID first}"
export AWS_SECRET_ACCESS_KEY="${AWS_SECRET_ACCESS_KEY:?set AWS_SECRET_ACCESS_KEY first}"
export AWS_DEFAULT_REGION="us-east-1"

# Import existing resource into state
terraform -chdir=terraform import aws_s3_bucket.data_lake my-existing-bucket
# Or recreate with unique name
# (only works if the configuration declares a bucket_suffix variable and uses
# it in the bucket name)
terraform -chdir=terraform apply -var="bucket_suffix=$(date +%s)"

# Test S3 permissions
aws s3 ls
# Test EC2 permissions
aws ec2 describe-instances
# Test IAM permissions
aws iam list-users

# Enable debug logging
export TF_LOG=DEBUG
terraform -chdir=terraform apply
# Show detailed plan
terraform -chdir=terraform plan -out=tfplan
terraform -chdir=terraform show tfplan
# Refresh state from actual infrastructure
# (`terraform refresh` is deprecated; -refresh-only shows the detected drift
# and asks for confirmation before writing it to state)
terraform -chdir=terraform apply -refresh-only
terraform plan