Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 8 additions & 18 deletions config/clusters/awi-ciroh/workshop.values.yaml
Original file line number Diff line number Diff line change
@@ -1,20 +1,10 @@
basehub:
# Enable node placeholders, to ensure capacity at first login
nodePlaceholder:
enabled: true
nodeSelector:
node.kubernetes.io/instance-type: n4-standard-16
replicas: 25
enabled: false
jupyterhub-home-nfs:
nodeSelector:
cloud.google.com/gke-nodepool: workshop-temp-nfs
tolerations:
- effect: NoSchedule
key: 2i2c.org/nfs-vm-family
operator: Equal
value: n4
gke:
volumeId: projects/ciroh-jupyterhub-423218/zones/us-central1-b/disks/home-nfs-workshop-hyperdisk
volumeId: projects/ciroh-jupyterhub-423218/zones/us-central1-b/disks/hub-nfs-homedirs-workshop
quotaEnforcer:
config:
QuotaManager:
Expand Down Expand Up @@ -42,7 +32,7 @@ basehub:
# Limit total size, to prevent the pod being evicted
# due to overuse by any one user. This can be increased,
# but doing so may require increasing the size of the node disk
sizeLimit: 20Gi
sizeLimit: 5Gi
profileList:
# The mem-guarantees are here so k8s doesn't schedule other pods
# on these nodes. They need to be just under total allocatable
Expand Down Expand Up @@ -147,7 +137,7 @@ basehub:
cpu_guarantee: 1.8887049999999999
cpu_limit: 15.109639999999999
node_selector:
node.kubernetes.io/instance-type: n4-standard-16
node.kubernetes.io/instance-type: n2-standard-16
- display_name: Medium
default: true
description: ~14 GB RAM, ~4 CPUs. Up to ~15 CPUs when available
Expand All @@ -158,7 +148,7 @@ basehub:
cpu_guarantee: 3.7774099999999997
cpu_limit: 15.109639999999999
node_selector:
node.kubernetes.io/instance-type: n4-standard-16
node.kubernetes.io/instance-type: n2-standard-16
- display_name: Large
description: ~30 GB RAM, ~8 CPUs .Up to ~62 CPUs when available
profile_options: *profile_options
Expand All @@ -168,7 +158,7 @@ basehub:
cpu_guarantee: 7.554819999999999
cpu_limit: 15.109639999999999
node_selector:
node.kubernetes.io/instance-type: n4-standard-16
node.kubernetes.io/instance-type: n2-standard-16
- display_name: Huge
description: ~59 GB RAM, ~16 CPUs. Up to ~62 CPUs when available
profile_options: *profile_options
Expand All @@ -180,7 +170,7 @@ basehub:
cpu_guarantee: 15.109639999999999
cpu_limit: 15.109639999999999
node_selector:
node.kubernetes.io/instance-type: n4-standard-16
node.kubernetes.io/instance-type: n2-standard-16
- display_name: Small (2i2c testing)
profile_options: *profile_options
allowed_groups:
Expand All @@ -191,7 +181,7 @@ basehub:
cpu_guarantee: 1.8887049999999999
cpu_limit: 15.109639999999999
node_selector:
node.kubernetes.io/instance-type: n4-standard-4
node.kubernetes.io/instance-type: n2-standard-4
- display_name: NVIDIA Tesla T4, ~16 GB, ~4 CPUs
description: Start a container on a dedicated node with a GPU
allowed_groups:
Expand Down
72 changes: 2 additions & 70 deletions terraform/gcp/projects/awi-ciroh.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,8 @@ project_id = "ciroh-jupyterhub-423218"
zone = "us-central1-b"
region = "us-central1"
core_node_machine_type = "n2-highmem-4"
#core_node_machine_type = "n4-highmem-4"
enable_network_policy = true
enable_logging = false

core_node_boot_disk = {
#type = "hyperdisk-balanced"
}
enable_network_policy = true
enable_logging = false

enable_filestore_backups = true
filestores = {}
Expand Down Expand Up @@ -94,73 +89,10 @@ notebook_nodes = {
max : 20,
machine_type : "n2-standard-64",
},

# Workshop N4 nodes
# Designed around the n4-standard-16 node configuration.
# These are optimal for supporting even huge instances whilst
# having better performance on hyperdisks than n4-standard-64.
# Model this on Medium profiles,
# assuming 20GiB scratch per user, 240MiB/s, 1250 IOPS
"n4-standard-4" : {
min : 0,
# Keep the numbers down, for safety!
max : 100,
machine_type : "n4-standard-4",

disk_type : "hyperdisk-balanced",

# Prefer large disks as cheap and safer
disk_size_gb : 160,

# Bump these relative to scaling from n4-standard-64, as the law of large numbers is worse for smaller samples,
# and the pathological best case is objectively worse (one person using IO can only hit the disk limit)
# than in the n4-standard-64 case.
# Minimum 3000
disk_iops : 3000,
# Limit of n4-standard-4
disk_throughput : 240
},
"n4-standard-16" : {
min : 0,
# Keep the numbers down, for safety!
max : 100,
machine_type : "n4-standard-16",

disk_type : "hyperdisk-balanced",

# Allow for 50% oversubscription (small + medium) and 100GiB for images
# i.e. X = (X_user * N_user * 1.5) + 100
disk_size_gb : 220,

# Do not compute oversubscription due to small numbers --
# Mix of smaller-than-medium profiles would disrupt this
disk_iops : 7000,
disk_throughput : 1200
},
"n4-standard-64" : {
min : 0,
# Keep the numbers down, for safety!
max : 30,
machine_type : "n4-standard-64",

disk_type : "hyperdisk-balanced",
disk_size_gb : 580,

# Limit of n4-standard-64 is 2400 MiB/s
disk_iops : 15000,
disk_throughput : 2400,
},

"gpu-t4" : {
min : 0,
max : 20,
machine_type : "n1-highmem-8",

# Use regular storage for scratch,
# as this is an n1 node
disk_type : "pd-ssd",
disk_size_gb : 120,

gpu : {
enabled : true,
type : "nvidia-tesla-t4",
Expand Down
Loading