Skip to content

Commit 2c885c8

Browse files
committed
init
0 parents  commit 2c885c8

File tree

8 files changed

+391
-0
lines changed

8 files changed

+391
-0
lines changed

.gitignore

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Local .terraform directories
2+
**/.terraform/*
3+
4+
# .tfstate files
5+
*.tfstate
6+
*.tfstate.*
7+
8+
# Crash log files
9+
crash.log
10+
crash.*.log
11+
12+
# Exclude all .tfvars files, which are likely to contain sensitive data, such as
13+
# password, private keys, and other secrets. These should not be part of version
14+
# control as they are data points which are potentially sensitive and subject
15+
# to change depending on the environment.
16+
*.tfvars
17+
*.tfvars.json
18+
19+
# Ignore override files as they are usually used to override resources locally and so
20+
# are not checked in
21+
override.tf
22+
override.tf.json
23+
*_override.tf
24+
*_override.tf.json
25+
26+
# Include override files you do wish to add to version control using negated pattern
27+
# !example_override.tf
28+
29+
# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
30+
# example: *tfplan*
31+
32+
# Ignore CLI configuration files
33+
.terraformrc
34+
terraform.rc
35+
36+
# local development
37+
.DS_Store
38+
tmp
39+
.vscode
40+
41+
# Terragrunt / editor / tooling artifacts
# (each pattern listed once; .vscode and *.tfstate* are already ignored above)
.*.sw?
.idea
terragrunt.iml
vendor
.terraform
*.out
.terragrunt-cache
.bundle
.ruby-version
.terraform.lock.hcl
terragrunt
secrets.hcl

LICENSE

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
This is free and unencumbered software released into the public domain.
2+
3+
Anyone is free to copy, modify, publish, use, compile, sell, or
4+
distribute this software, either in source code form or as a compiled
5+
binary, for any purpose, commercial or non-commercial, and by any
6+
means.
7+
8+
In jurisdictions that recognize copyright laws, the author or authors
9+
of this software dedicate any and all copyright interest in the
10+
software to the public domain. We make this dedication for the benefit
11+
of the public at large and to the detriment of our heirs and
12+
successors. We intend this dedication to be an overt act of
13+
relinquishment in perpetuity of all present and future rights to this
14+
software under copyright law.
15+
16+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19+
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20+
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21+
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22+
OTHER DEALINGS IN THE SOFTWARE.
23+
24+
For more information, please refer to <https://unlicense.org>

README.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
## EKS Mixed cluster for AI pipelines and applications
2+
3+
Bare-bones EKS cluster with mixed CPU and (optional) GPU node groups.
4+
- Managed node groups
5+
- GPU nodes are tainted `dedicated=gpuGroup:NO_SCHEDULE` by default
6+
- Karpenter discovery tag (`karpenter.sh/discovery`) applied to cluster resources
7+
8+
#### Inputs
9+
```hcl
10+
cluster_name
11+
cluster_version
12+
auth_users
13+
ssh_key_name
14+
region
15+
vpc_id
16+
vpc_subnet_ids
17+
vpc_default_security_group_ids
18+
vpc_controlplane_subnet_ids
19+
node_instance_types
20+
node_min_size
21+
node_max_size
22+
node_desired_size
23+
enable_gpu_nodes
24+
gpu_node_instance_types
25+
gpu_min_nodes
26+
gpu_max_nodes
27+
gpu_desired_nodes
28+
karpenter_enable
29+
karpenter_role_arn
30+
enable_monitoring
31+
default_tags
32+
```
33+
34+
#### Outputs
35+
```hcl
36+
cluster_name
37+
cluster_endpoint
38+
cluster_certificate_authority_data
39+
cluster_oidc_provider_arn
40+
```

main.tf

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
# Identity (account, ARN, user ID) of the credentials the AWS provider runs as.
# NOTE(review): nothing in the visible configuration references this data source — confirm it is still needed.
data "aws_caller_identity" "current" {}
2+
3+
locals {
  # Remote (SSH) access settings handed to node groups that opt in:
  # the EC2 key pair name and the source security group defined in this module.
  remote_access = {
    ec2_ssh_key               = var.ssh_key_name
    source_security_group_ids = [aws_security_group.remote_access.id]
  }

  # General-purpose on-demand node group; always created.
  default_node_group = {
    default = {
      capacity_type  = "ON_DEMAND"
      instance_types = var.node_instance_types
      min_size       = var.node_min_size
      max_size       = var.node_max_size
      desired_size   = var.node_desired_size
    }
  }

  # Optional spot GPU node group; an empty map when GPU nodes are disabled so
  # that the merge below is a no-op.
  gpu_node_group = var.enable_gpu_nodes ? {
    gpu = {
      # GPU instance types need the accelerated EKS AMI, which ships the NVIDIA
      # drivers and container runtime; without it EKS falls back to the
      # standard AMI and the GPUs are not usable by pods.
      # NOTE(review): AL2 AMIs are tied to the Kubernetes version in use —
      # revisit this value when raising var.cluster_version.
      ami_type       = "AL2_x86_64_GPU"
      instance_types = var.gpu_node_instance_types
      capacity_type  = "SPOT"
      min_size       = var.gpu_min_nodes
      max_size       = var.gpu_max_nodes
      desired_size   = var.gpu_desired_nodes

      # Keep ordinary workloads off the GPU nodes; pods must tolerate
      # dedicated=gpuGroup:NoSchedule to land here.
      taints = {
        dedicated = {
          key    = "dedicated"
          value  = "gpuGroup"
          effect = "NO_SCHEDULE"
        }
      }

      remote_access = local.remote_access
    }
  } : {}

  # Final map passed to the EKS module: default group plus GPU group (if any).
  merged_node_groups = merge(local.default_node_group, local.gpu_node_group)
}
36+
37+
# EKS cluster with managed node groups, built on terraform-aws-modules/eks.
module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "~> 19.17.2"

  cluster_name = var.cluster_name
  # Pin the control-plane version to the declared input; previously the
  # variable existed but was never passed, so it had no effect.
  cluster_version                = var.cluster_version
  cluster_endpoint_public_access = true

  vpc_id                   = var.vpc_id
  subnet_ids               = var.vpc_subnet_ids
  control_plane_subnet_ids = var.vpc_controlplane_subnet_ids

  # NOTE(review): the module documents create_aws_auth_configmap as intended
  # only for clusters where the ConfigMap does not already exist (e.g.
  # self-managed node groups only); with managed node groups EKS creates it.
  # Left unchanged to avoid disturbing existing state — confirm.
  # NOTE(review): managing aws-auth requires a configured kubernetes provider,
  # which is not visible in this configuration — confirm it is supplied.
  create_aws_auth_configmap = true
  manage_aws_auth_configmap = true

  aws_auth_users = var.auth_users

  # Map the Karpenter node IAM role so nodes launched by Karpenter can join
  # the cluster. Only emitted when an ARN is supplied: an entry with an empty
  # rolearn is not a valid aws-auth mapping.
  aws_auth_roles = var.karpenter_role_arn != "" ? [
    {
      rolearn  = var.karpenter_role_arn
      username = "system:node:{{EC2PrivateDNSName}}"
      groups = [
        "system:bootstrappers",
        "system:nodes",
      ]
    },
  ] : []

  eks_managed_node_groups = local.merged_node_groups

  # karpenter.sh/discovery lets Karpenter find the resources of this cluster.
  tags = merge(var.default_tags, {
    "karpenter.sh/discovery" = var.cluster_name
  })
}
69+
70+
71+
# Security group used as the SSH source for node group remote access
# (referenced from local.remote_access).
resource "aws_security_group" "remote_access" {
  name_prefix = "${var.cluster_name}-remote-access"
  description = "Allow remote SSH access"
  vpc_id      = var.vpc_id

  # NOTE(review): the SSH source range is hard-coded to the 10.0.0.0/8
  # private block — confirm this matches the networks that should have access.
  ingress {
    description = "SSH access"
    from_port   = 22
    to_port     = 22
    protocol    = "tcp"
    cidr_blocks = ["10.0.0.0/8"]
  }

  # Unrestricted outbound traffic (all protocols, IPv4 and IPv6).
  egress {
    from_port        = 0
    to_port          = 0
    protocol         = "-1"
    cidr_blocks      = ["0.0.0.0/0"]
    ipv6_cidr_blocks = ["::/0"]
  }

  tags = var.default_tags

  # The group is named via name_prefix and referenced by a node group, so a
  # replacement must create the new group before the old one is destroyed;
  # otherwise the delete fails while the group is still in use.
  lifecycle {
    create_before_destroy = true
  }
}

outputs.tf

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Name of the cluster as reported by the EKS module.
output "cluster_name" {
  value       = module.eks.cluster_name
  description = "The name of the EKS cluster."
}

# Kubernetes API server endpoint.
output "cluster_endpoint" {
  value       = module.eks.cluster_endpoint
  description = "The endpoint for the EKS cluster."
}

# Cluster CA bundle; flagged sensitive so it is redacted in CLI output.
output "cluster_certificate_authority_data" {
  value       = module.eks.cluster_certificate_authority_data
  sensitive   = true
  description = "The certificate-authority-data for the EKS cluster."
}
16+
17+
# ARN of the IAM OIDC provider created for the cluster (used when setting up
# IAM roles for service accounts). This is the provider ARN, not the issuer URL.
output "cluster_oidc_provider_arn" {
  description = "The ARN of the IAM OIDC provider for the EKS cluster."
  value       = module.eks.oidc_provider_arn
}

providers.tf

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# AWS provider. The region comes from var.region (default "us-east-1") rather
# than a literal, so the declared input actually controls where resources go.
provider "aws" {
  region = var.region
}

variables.tf

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
# Required: cluster name; also used for the security group prefix and the
# Karpenter discovery tag.
variable "cluster_name" {
  type        = string
  description = "The name of the EKS cluster."
}

# Kubernetes control-plane version.
variable "cluster_version" {
  type        = string
  default     = "1.28"
  description = "The Kubernetes server version for the EKS cluster."
}

# IAM users mapped into the aws-auth ConfigMap; empty by default.
variable "auth_users" {
  type = list(object({
    userarn  = string
    username = string
    groups   = list(string)
  }))
  default     = []
  description = "The list of users to add to the aws-auth configmap."
}
21+
22+
# EC2 key pair installed on nodes via the node group remote-access settings
# (see local.remote_access); this module does not create a bastion host.
variable "ssh_key_name" {
  description = "The name of the EC2 SSH key pair used for remote access to the node groups."
  type        = string
  default     = "sandbox_key"
}
27+
28+
# Target AWS region.
variable "region" {
  type        = string
  default     = "us-east-1"
  description = "The region in which to create the EKS cluster."
}

# Required: VPC hosting the cluster and the remote-access security group.
variable "vpc_id" {
  type        = string
  description = "The VPC ID in which to create the EKS cluster."
}

# Required: subnets passed to the EKS module as subnet_ids.
variable "vpc_subnet_ids" {
  type        = list(string)
  description = "The subnet IDs in which to create the EKS cluster."
}
43+
44+
# NOTE(review): not referenced anywhere in the visible configuration —
# confirm whether it should be wired into the cluster or removed.
variable "vpc_default_security_group_ids" {
  description = "The IDs of the default security groups for the VPC."
  type        = list(string)
  default     = []
}

# Required: passed to the EKS module as control_plane_subnet_ids.
variable "vpc_controlplane_subnet_ids" {
  description = "The subnet IDs (typically intra subnets) in which the EKS control plane network interfaces are provisioned."
  type        = list(string)
}
54+
55+
# Tags applied to the cluster (merged with the Karpenter discovery tag) and to
# the remote-access security group.
variable "default_tags" {
  type = map(string)
  default = {
    Terraform   = "true"
    Environment = "sandbox"
  }
  description = "The tags to apply to all resources in the module."
}

# --- Default (on-demand) node group sizing ---

variable "node_instance_types" {
  type        = list(string)
  default     = ["t3.medium"]
  description = "The instance types for the default node group."
}

variable "node_min_size" {
  type        = number
  default     = 1
  description = "The minimum number of nodes."
}

variable "node_max_size" {
  type        = number
  default     = 3
  description = "The maximum number of nodes."
}

variable "node_desired_size" {
  type        = number
  default     = 1
  description = "The desired number of nodes."
}
87+
88+
# --- GPU (spot) node group ---

# Master switch: when false, no GPU node group is created.
variable "enable_gpu_nodes" {
  type        = bool
  default     = false
  description = "Whether to enable GPU nodes."
}

variable "gpu_node_instance_types" {
  type        = list(string)
  default     = ["g4dn.xlarge"]
  description = "The instance types for the default GPU node group."
}

variable "gpu_min_nodes" {
  type        = number
  default     = 0
  description = "The minimum number of GPU nodes."
}

variable "gpu_max_nodes" {
  type        = number
  default     = 3
  description = "The maximum number of GPU nodes."
}

variable "gpu_desired_nodes" {
  type        = number
  default     = 0
  description = "The desired number of GPU nodes."
}
117+
118+
# NOTE(review): not referenced anywhere in the visible configuration —
# the Karpenter role mapping is driven by karpenter_role_arn alone.
variable "karpenter_enable" {
  description = "Whether to enable Karpenter."
  type        = bool
  default     = false
}

# Mapped into aws-auth with the system:bootstrappers / system:nodes groups so
# that nodes launched by Karpenter can join the cluster.
variable "karpenter_role_arn" {
  description = "The ARN of the IAM role assumed by nodes that Karpenter launches; it is added to the aws-auth configmap. Leave empty when Karpenter is not used."
  type        = string
  default     = ""
}

# NOTE(review): not referenced anywhere in the visible configuration —
# confirm whether monitoring is implemented elsewhere.
variable "enable_monitoring" {
  description = "Whether to enable monitoring."
  type        = bool
  default     = false
}

versions.tf

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Toolchain constraints: minimum Terraform CLI and AWS provider versions.
terraform {
  required_providers {
    aws = {
      version = ">= 4.57"
      source  = "hashicorp/aws"
    }
  }

  required_version = ">= 1.5"
}

0 commit comments

Comments
 (0)