Habana Gaudi AWS DL1 Base Example (#30)

wsfowler · web-flow · commit 35aa7e2c22e8 · 2024-05-07T16:08:01.000-05:00
* Initial commit

* Update cloudinit flow

* Update cloud-init flow

* Updated cloud-init

* Update Gaudi example

* Updated readme

* Remove older files

* Delete examples/gen-ai-demo/main.tf

* Add Habana links to readme
diff --git a/examples/gen-ai-gaudi-base/README.md b/examples/gen-ai-gaudi-base/README.md
@@ -0,0 +1,125 @@
+<p align="center">
+  <img src="https://github.com/intel/terraform-intel-aws-vm/blob/main/images/logo-classicblue-800px.png?raw=true" alt="Intel Logo" width="250"/>
+</p>
+
+# Intel® Optimized Cloud Modules for Terraform
+
+© Copyright 2024, Intel Corporation
+
+## AWS DL1 EC2 Instance with Intel Gaudi Accelerators
+
+This demo will showcase Large Language Model (LLM) inference using Intel Gaudi AI Accelerators. This module will install the base software required to run other examples.
+
+## Usage
+
+### variables.tf
+
+Modify the region to target a specific AWS Region
+
+```hcl
+variable "region" {
+  description = "Target AWS region to deploy EC2 in."
+  type        = string
+  default     = "us-east-1"
+}
+```
+
+### main.tf
+
+Modify settings in this file to choose your AMI as well as other details around the instance that will be created. This demo was tested on Ubuntu 22.04.
+
+```hcl
+## Get latest Ubuntu 22.04 AMI in AWS for x86
+data "aws_ami" "ubuntu-linux-2204" {
+  most_recent = true
+  owners      = ["099720109477"] # Canonical
+  filter {
+    name   = "name"
+    values = ["ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*"]
+  }
+  filter {
+    name   = "virtualization-type"
+    values = ["hvm"]
+  }
+}
+
+module "ec2-vm" {
+  source            = "intel/aws-vm/intel"
+  key_name          = aws_key_pair.TF_key.key_name
+  instance_type     = "dl1.24xlarge"
+  availability_zone = "us-east-1c"
+  ami               = data.aws_ami.ubuntu-linux-2204.id
+  user_data         = data.cloudinit_config.ansible.rendered
+
+  root_block_device = [{
+    volume_size = "100"
+  }]
+
+  tags = {
+    Name     = "my-test-vm-${random_id.rid.dec}"
+    Owner    = "OwnerName-${random_id.rid.dec}",
+    Duration = "2"
+  }
+}
+```
+
+Run the Terraform Commands below to deploy the demos.
+
+```Shell
+terraform init
+terraform plan
+terraform apply
+```
+
+## Running the Demo using AWS CloudShell
+
+Open your AWS account and click the CloudShell prompt.
+At the command prompt, enter the following commands to install Terraform into AWS CloudShell:
+
+```Shell
+git clone https://github.com/tfutils/tfenv.git ~/.tfenv
+mkdir ~/bin
+ln -s ~/.tfenv/bin/* ~/bin/
+tfenv install 1.3.0
+tfenv use 1.3.0
+```
+
+Download the [gen-ai-gaudi-base](https://github.com/intel/terraform-intel-aws-vm/tree/main/examples/gen-ai-gaudi-base) Terraform example by cloning the repository with this command
+
+```Shell
+git clone https://github.com/intel/terraform-intel-aws-vm.git
+```
+
+Change into the `examples/gen-ai-gaudi-base` example folder
+
+```Shell
+cd terraform-intel-aws-vm/examples/gen-ai-gaudi-base
+```
+
+Run the Terraform Commands below to deploy the demos.
+
+```Shell
+terraform init
+terraform plan
+terraform apply
+```
+
+After the Terraform module successfully creates the EC2 instance, **wait ~15 minutes** for the recipe to download/install the Intel Gaudi driver and software. After the deployment is done, you can launch the Habana Gaudi PyTorch container using the following:
+
+```bash
+sudo docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
+```
+
+## Deleting the Demo
+
+To delete the demo, run `terraform destroy` to delete all resources created.
+
+## Considerations
+
+- The AWS region where this example is run should have a default VPC
+
+## Links
+
+[Intel® Gaudi® AI Accelerator](https://www.intel.com/content/www/us/en/products/details/processors/ai-accelerators/gaudi-overview.html)
+
+[Intel® Gaudi® AI Accelerator - Developer Website](https://developer.habana.ai/)
diff --git a/examples/gen-ai-gaudi-base/cloud_init.yml b/examples/gen-ai-gaudi-base/cloud_init.yml
@@ -0,0 +1,9 @@
+#cloud-config
+package_update: true
+package_upgrade: true
+
+runcmd:
+  - apt install git ansible docker.io -y
+  - git clone https://github.com/intel/optimized-cloud-recipes.git /opt/optimized-cloud-recipes
+  - echo "@reboot ansible-playbook /opt/optimized-cloud-recipes/recipes/ai-gaudi-ubuntu/recipe.yml" | crontab - 
+  - reboot
diff --git a/examples/gen-ai-gaudi-base/main.tf b/examples/gen-ai-gaudi-base/main.tf
@@ -1,4 +1,4 @@
-# Provision EC2 Instance on Icelake on Amazon Linux OS in default vpc. It is configured to create the EC2 in
+# Provision EC2 DL1 Instance on Ubuntu Linux OS in default vpc. It is configured to create the EC2 in
 # US-East-1 region. The region is provided in variables.tf in this example folder.
 
 # This example also create an EC2 key pair. Associate the public key with the EC2 instance. Create the private key
@@ -80,7 +80,7 @@ module "ec2-vm" {
   count = var.vm_count
   source            = "intel/aws-vm/intel"
   key_name          = aws_key_pair.TF_key.key_name
-  instance_type     = "m7i.4xlarge"
+  instance_type     = "dl1.24xlarge"
   availability_zone = "us-east-1c"
   ami               = data.aws_ami.ubuntu-linux-2204.id
   user_data         = data.cloudinit_config.ansible.rendered
diff --git a/examples/gen-ai-gaudi-base/outputs.tf b/examples/gen-ai-gaudi-base/outputs.tf
@@ -0,0 +1,113 @@
+output "id" {
+  description = "The ID of the instance"
+  value       = try(module.ec2-vm.*.id, "")
+}
+
+output "arn" {
+  description = "The ARN of the instance"
+  value       = try(module.ec2-vm.*.arn, "")
+}
+
+output "capacity_reservation_specification" {
+  description = "Capacity reservation specification of the instance"
+  value       = try(module.ec2-vm.*.capacity_reservation_specification, "")
+}
+
+output "instance_state" {
+  description = "The state of the instance. One of: `pending`, `running`, `shutting-down`, `terminated`, `stopping`, `stopped`"
+  value       = try(module.ec2-vm.*.instance_state, "")
+}
+
+output "outpost_arn" {
+  description = "The ARN of the Outpost the instance is assigned to"
+  value       = try(module.ec2-vm.*.outpost_arn, "")
+}
+
+output "password_data" {
+  description = "Base-64 encoded encrypted password data for the instance. Useful for getting the administrator password for instances running Microsoft Windows. This attribute is only exported if `get_password_data` is true"
+  value       = try(module.ec2-vm.*.password_data, "")
+}
+
+output "primary_network_interface_id" {
+  description = "The ID of the instance's primary network interface"
+  value       = try(module.ec2-vm.*.primary_network_interface_id, "")
+}
+
+output "private_dns" {
+  description = "The private DNS name assigned to the instance. Can only be used inside the Amazon EC2, and only available if you've enabled DNS hostnames for your VPC"
+  value       = try(module.ec2-vm.*.private_dns, "")
+}
+
+output "public_dns" {
+  description = "The public DNS name assigned to the instance. For EC2-VPC, this is only available if you've enabled DNS hostnames for your VPC"
+  value       = try(module.ec2-vm.*.public_dns, "")
+}
+
+output "public_ip" {
+  description = "The public IP address assigned to the instance, if applicable. NOTE: If you are using an aws_eip with your instance, you should refer to the EIP's address directly and not use `public_ip` as this field will change after the EIP is attached"
+  value       = try(module.ec2-vm.*.public_ip, "")
+}
+
+output "private_ip" {
+  description = "The private IP address assigned to the instance."
+  value       = try(module.ec2-vm.*.private_ip, "")
+}
+
+output "ipv6_addresses" {
+  description = "The IPv6 address assigned to the instance, if applicable."
+  value       = try(module.ec2-vm.*.ipv6_addresses, [])
+}
+
+output "tags_all" {
+  description = "A map of tags assigned to the resource, including those inherited from the provider default_tags configuration block"
+  value       = try(module.ec2-vm.*.tags_all, {})
+}
+
+output "spot_bid_status" {
+  description = "The current bid status of the Spot Instance Request"
+  value       = try(module.ec2-vm.*.spot_bid_status, "")
+}
+
+output "spot_request_state" {
+  description = "The current request state of the Spot Instance Request"
+  value       = try(module.ec2-vm.*.spot_request_state, "")
+}
+
+output "spot_instance_id" {
+  description = "The Instance ID (if any) that is currently fulfilling the Spot Instance request"
+  value       = try(module.ec2-vm.*.spot_instance_id, "")
+}
+
+################################################################################
+# IAM Role / Instance Profile
+################################################################################
+
+output "iam_role_name" {
+  description = "The name of the IAM role"
+  value       = try(module.ec2-vm.*.aws_iam_role.name, null)
+}
+
+output "iam_role_arn" {
+  description = "The Amazon Resource Name (ARN) specifying the IAM role"
+  value       = try(module.ec2-vm.*.aws_iam_role.arn, null)
+}
+
+output "iam_role_unique_id" {
+  description = "Stable and unique string identifying the IAM role"
+  value       = try(module.ec2-vm.*.aws_iam_role.unique_id, null)
+}
+
+output "iam_instance_profile_arn" {
+  description = "ARN assigned by AWS to the instance profile"
+  value       = try(module.ec2-vm.*.aws_iam_instance_profile.arn, null)
+}
+
+output "iam_instance_profile_id" {
+  description = "Instance profile's ID"
+  value       = try(module.ec2-vm.*.aws_iam_instance_profile.id, null)
+}
+
+output "iam_instance_profile_unique" {
+  description = "Stable and unique string identifying the IAM instance profile"
+  value       = try(module.ec2-vm.*.aws_iam_instance_profile.unique_id, null)
+}
diff --git a/examples/gen-ai-gaudi-base/providers.tf b/examples/gen-ai-gaudi-base/providers.tf
@@ -0,0 +1,4 @@
+provider "aws" {
+  # Environment Variables used for Authentication
+  region = var.region
+}
diff --git a/examples/gen-ai-gaudi-base/variables.tf b/examples/gen-ai-gaudi-base/variables.tf
@@ -0,0 +1,50 @@
+variable "region" {
+  description = "Target AWS region to deploy EC2 in."
+  type        = string
+  default     = "us-east-1"
+}
+
+# Variable to add ingress rules to the security group. The defaults open ports 22, 7860, 5000 and 5001 to 0.0.0.0/0 (any address); replace them with the required ports and CIDR ranges.
+variable "ingress_rules" {
+  type = list(object({
+    from_port   = number
+    to_port     = number
+    protocol    = string
+    cidr_blocks = string
+  }))
+  default = [
+    {
+      from_port   = 22
+      to_port     = 22
+      protocol    = "tcp"
+      cidr_blocks = "0.0.0.0/0"
+      
+    },
+    {
+      from_port   = 7860
+      to_port     = 7860
+      protocol    = "tcp"
+      cidr_blocks = "0.0.0.0/0"
+      
+    },
+    {
+      from_port   = 5000
+      to_port     = 5000
+      protocol    = "tcp"
+      cidr_blocks = "0.0.0.0/0"
+    },
+    {
+      from_port   = 5001
+      to_port     = 5001
+      protocol    = "tcp"
+      cidr_blocks = "0.0.0.0/0"
+    }
+  ]
+}
+
+# Variable for how many VMs to build
+variable "vm_count" {
+  description = "Number of VMs to build."
+  type        = number
+  default     = 1
+}
diff --git a/examples/gen-ai-gaudi-base/versions.tf b/examples/gen-ai-gaudi-base/versions.tf
@@ -0,0 +1,13 @@
+terraform {
+  required_version = ">=1.3.0"
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.31"
+    }
+    cloudinit = {
+      source  = "hashicorp/cloudinit"
+      version = ">=2.2.0"
+    }
+  }
+}