Skip to content

Openzfs smhp #622

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Apr 10, 2025
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ SHARED_USER_FILE="shared_users.txt"
create_user() {
local username=$1
local uid=$2
local home=$3
local fsx_home=$3
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we be more descriptive? fsxl vs fsxz?


# check if username already exists
if id -u "$username" >/dev/null 2>&1; then
Expand All @@ -33,13 +33,31 @@ create_user() {
echo "UID $uid is already in use. Skipping adding user: $username..."
return
fi

# create user with uid and directory
if useradd -m $username --uid $uid -d $home --shell /bin/bash; then
echo "Created user $username with uid $uid and home $home."

# Determine home directory based on OpenZFS filesystem availability
if df -h | grep -q "/home"; then
echo "OpenZFS is mounted at /home"
local home="/home/$username"

# Create user with OpenZFS home
if useradd -m $username --uid $uid -d $home --shell /bin/bash; then
echo "Created user $username with uid $uid and home $home."

# Make sure fsxl directory still exists and is accessible
sudo mkdir -p $fsx_home
sudo chown $username:$username $fsx_home
else
echo "Failed to create user $username with uid $uid"
fi
else
echo "Failed to create user $username with uid $uid"
fi
echo "OpenZFS is not mounted. Using FSxL file system"
# create user with uid and directory
if useradd -m $username --uid $uid -d $fsx_home --shell /bin/bash; then
echo "Created user $username with uid $uid and home $home."
else
echo "Failed to create user $username with uid $uid"
fi
fi
}

main() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ class Config:
# requires s3 permissions to be added to cluster execution role.
enable_mount_s3 = False

# Set true if you want to use FSx OpenZFS in addition to FSxL.
enable_fsx_openzfs = False


s3_bucket = "" # required when enable_mount_s3 = True, replace with your actual data bucket name in quotes, ie. "my-dataset-bucket"

if enable_mount_s3 and not s3_bucket:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ class ProvisioningParameters:
WORKLOAD_MANAGER_KEY: str = "workload_manager"
FSX_DNS_NAME: str = "fsx_dns_name"
FSX_MOUNT_NAME: str = "fsx_mountname"
FSX_OPENZFS_DNS_NAME: str = "fsx_openzfs_dns_name"
SLURM_CONFIGURATIONS: str = "slurm_configurations"

def __init__(self, path: str):
Expand All @@ -74,6 +75,10 @@ def workload_manager(self) -> Optional[str]:
def fsx_settings(self) -> Tuple[str, str]:
return self._params.get(ProvisioningParameters.FSX_DNS_NAME), self._params.get(ProvisioningParameters.FSX_MOUNT_NAME)

@property
def fsx_openzfs_settings(self) -> Optional[str]:
return self._params.get(ProvisioningParameters.FSX_OPENZFS_DNS_NAME)

@property
def controller_group(self) -> Optional[str]:
return self._params.get("controller_group")
Expand Down Expand Up @@ -160,6 +165,12 @@ def main(args):
print(f"Mount fsx: {fsx_dns_name}. Mount point: {fsx_mountname}")
ExecuteBashScript("./mount_fsx.sh").run(fsx_dns_name, fsx_mountname, "/fsx")

# Add FSx OpenZFS mount section
fsx_openzfs_dns_name = params.fsx_openzfs_settings
if Config.enable_fsx_openzfs and fsx_openzfs_dns_name:
print(f"Mount FSx OpenZFS: {fsx_openzfs_dns_name}. Mount point: /home")
ExecuteBashScript("./mount_fsx_openzfs.sh").run(fsx_openzfs_dns_name, "/home")

ExecuteBashScript("./add_users.sh").run()

if params.workload_manager == "slurm":
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/bin/bash

# Requires root privileges (invoke through sudo).

# Trace every command and abort on the first unhandled failure.
set -xe

# Positional arguments: the FSx OpenZFS DNS name, then the local mount point.
FSX_OPENZFS_DNS_NAME="${1}"
OPENZFS_MOUNT_POINT="${2}"

# Pinned versions: NFS protocol requested for the mount, and the Ansible
# release installed through pip.
NFS_VERSION="4.2"
ANSIBLE_VERSION="6.7.0"

# Function for error handling
# Reports the failing command and its exit status on stderr, then terminates
# the script with that same status. Installed as the ERR trap handler, so
# $? must be captured before any other command runs.
handle_error()
{
  local exit_code=$?
  # Diagnostics belong on stderr: they are not regular output, and the
  # `set -x` trace is written there too, so the two stay in order.
  echo "Error occured in command: $BASH_COMMAND" >&2
  echo "Exit code: $exit_code" >&2
  exit "$exit_code"
}

# errtrace makes shell functions, command substitutions and subshells inherit
# the ERR trap. Without it, a failure inside main() or one of its helpers
# leaves through `set -e` without ever reaching handle_error.
set -o errtrace
trap handle_error ERR

# Guard: both FSX_OPENZFS_DNS_NAME and OPENZFS_MOUNT_POINT must be non-empty.
# Returns 0 when they are; otherwise prints a usage line and exits with 1.
verify_parameters()
{
  if [[ -n "$FSX_OPENZFS_DNS_NAME" && -n "$OPENZFS_MOUNT_POINT" ]]; then
    return 0
  fi

  echo "Usage: $0 <fsx_dns_name> <mount_point>"
  exit 1
}

# Install Ansible and collections: Move to higher LCS once others start using Ansible too.
# Bootstraps pip with whichever system package manager is present, then
# installs the pinned Ansible release (ANSIBLE_VERSION) and the ansible.posix
# collection, whose mount module mount_fs invokes.
# Returns: non-zero when no supported package manager is found.
install_ansible()
{
  # install_nfs_client has a CentOS/RHEL branch, so pip must be obtainable
  # on those systems as well rather than assuming apt-get.
  if command -v apt-get >/dev/null 2>&1; then
    apt-get update
    # Never let an unattended run block on a debconf prompt.
    DEBIAN_FRONTEND=noninteractive apt-get install -y python3-pip
  elif command -v dnf >/dev/null 2>&1; then
    dnf install -y python3-pip
  elif command -v yum >/dev/null 2>&1; then
    yum install -y python3-pip
  else
    echo "install_ansible: no supported package manager (apt-get, dnf, yum) found" >&2
    return 1
  fi

  python3 -m pip install "ansible==${ANSIBLE_VERSION}"
  ansible-galaxy collection install ansible.posix
}

# Install NFS Client based on OS
# Chooses the Ansible package module from the release file found under /etc:
#   /etc/lsb-release    -> apt, package nfs-common (Ubuntu)
#   /etc/redhat-release -> yum, package nfs-utils  (CentOS/RHEL)
# On any other system nothing is installed; a warning is written to stderr and
# the function still returns 0, since an NFS client may already be present.
install_nfs_client()
{
  local module module_args

  if [[ -f /etc/lsb-release ]]; then
    # Ubuntu
    module="ansible.builtin.apt"
    module_args="name=nfs-common state=present update_cache=yes"
  elif [[ -f /etc/redhat-release ]]; then
    # CentOS/RHEL
    module="ansible.builtin.yum"
    module_args="name=nfs-utils state=present"
  else
    # Previously this case fell through silently, leaving a later mount
    # failure as the only hint that no client had been installed.
    echo "Warning: unrecognized OS, skipping NFS client installation" >&2
    return 0
  fi

  ansible localhost -b -m "$module" -a "$module_args"
}

# Mount the FSx OpenZFS file system
# Calls the ansible.posix.mount module against localhost (with -b) to mount
# "<FSX_OPENZFS_DNS_NAME>:/fsx" as NFS at OPENZFS_MOUNT_POINT, state=mounted.
mount_fs()
{
  # NFS mount options, assembled separately to keep the module arguments readable.
  local nfs_opts="nfsvers=$NFS_VERSION,_netdev,nconnect=16,x-systemd.automount,x-systemd.requires=network-online.target"
  local module_args="path=$OPENZFS_MOUNT_POINT src=$FSX_OPENZFS_DNS_NAME:/fsx fstype=nfs opts=$nfs_opts dump=0 passno=0 state=mounted"

  ansible localhost -b -m ansible.posix.mount -a "$module_args"
}

# Entry point: logs the inputs, validates them, installs Ansible and the NFS
# client, mounts the file system, and prints a confirmation line.
main()
{
  printf '%s\n' \
    "Mount_fsx_openzfs called with fsx_openzfs_dns_name: $FSX_OPENZFS_DNS_NAME" \
    "Using openzfs_mount_point: $OPENZFS_MOUNT_POINT"

  # Order matters: validate first, install tooling, then mount.
  verify_parameters
  install_ansible
  install_nfs_client
  mount_fs

  printf '%s\n' "FSx OpenZFS mounted successfully to $OPENZFS_MOUNT_POINT"
}

main "$@"
Original file line number Diff line number Diff line change
@@ -1,23 +1,65 @@
#!/bin/bash

# Wait for FSx to be properly mounted (timeout after 60 seconds)
# RETRY CONFIG
ATTEMPTS=6
WAIT=10
for ((i=1; i<=ATTEMPTS; i++)); do
if mountpoint -q "/fsx" && touch /fsx/.test_write 2>/dev/null; then
rm -f /fsx/.test_write
break
FSX_OPENZFS_DNS_NAME="/home"
FSX_L_DNS_NAME="/fsx"

# Function to check mount
check_mount()
{
local mount_point="$1"
if mountpoint -q "$mount_point" && touch "$mount_point/.test_write" 2>/dev/null; then
rm -f "$mount_point/.test_write"
return 0
fi
if [ $i -eq $ATTEMPTS ]; then
echo "FSx mount not ready after $((ATTEMPTS * WAIT)) seconds"
return 1
}

# Wait for mount (both OpenZFS and FSxL)
# Polls check_mount for the given mount point up to ATTEMPTS times, sleeping
# WAIT seconds between attempts.
# Arguments: $1 - mount point to verify
# Returns:   0 as soon as check_mount succeeds, 1 if it never does
wait_for_mount()
{
  local mount_point="$1"
  # Keep the counter local so a caller's own `i` is not clobbered.
  local i

  for ((i = 1; i <= ATTEMPTS; i++)); do
    if check_mount "$mount_point"; then
      echo "Successfully verified mount at $mount_point"
      return 0
    fi
    # No sleep after the final failed attempt.
    if ((i < ATTEMPTS)); then
      echo "Waiting for FSx mount: $mount_point to be ready... (attempt $i/$ATTEMPTS)"
      sleep "$WAIT"
    fi
  done

  # Reached when every attempt failed, and also when ATTEMPTS is 0, a case
  # that used to fall out of the loop with status 0 without checking anything.
  echo "Mount not ready after $((ATTEMPTS * WAIT)) seconds"
  return 1
}

# Check if OpenZFS is mounted
if wait_for_mount "$FSX_OPENZFS_DNS_NAME"; then
echo "OpenZFS is mounted at $FSX_OPENZFS_DNS_NAME"
if [ -d "$FSX_OPENZFS_DNS_NAME" ]; then
# Set home directory to /home/ubuntu
sudo usermod -m -d "$FSX_OPENZFS_DNS_NAME/ubuntu" ubuntu
echo "Home directory set to $FSX_OPENZFS_DNS_NAME/ubuntu"

# Maintain access to /fsx/ubuntu
if wait_for_mount "$FSX_L_DNS_NAME"; then
sudo mkdir -p "$FSX_L_DNS_NAME/ubuntu"
sudo chown ubuntu:ubuntu "$FSX_L_DNS_NAME/ubuntu"
else
echo "Warning: FSx mount not available, skipping $FSX_L_DNS_NAME/ubuntu setup"
fi
fi
else
echo "OpenZFS is not mounted. Using FSxL file system as home"
if ! wait_for_mount "$FSX_L_DNS_NAME"; then
echo "Warning: FSx mount not available. Exiting."
exit 1
fi
sleep $WAIT
done

# move the ubuntu user to the shared /fsx filesystem
if [ -d "/fsx/ubuntu" ]; then
sudo usermod -d /fsx/ubuntu ubuntu
elif [ -d "/fsx" ]; then
sudo usermod -m -d /fsx/ubuntu ubuntu
fi
if [ -d "$FSX_L_DNS_NAME/ubuntu" ]; then
sudo usermod -d "$FSX_L_DNS_NAME/ubuntu" ubuntu
elif [ -d "$FSX_L_DNS_NAME" ]; then
sudo usermod -m -d "$FSX_L_DNS_NAME/ubuntu" ubuntu
fi
fi
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,39 @@ setup_lifecycle_scripts() {
echo -e "${BLUE}Continuing with Neuron disabled in LCS...${NC}"
fi

# Check if FSx OpenZFS was deployed in the stack
# Reads the FSxOpenZFSFileSystemDNSname output of the STACK_ID_VPC
# CloudFormation stack. A non-empty value is treated as "OpenZFS was deployed".
echo -e "${BLUE}Checking if FSx OpenZFS was deployed in the stack...${NC}"

# Start disabled; switched to "true" below only when a DNS name is found.
export ENABLE_FSX_OPENZFS="false"

FSX_OPENZFS_DNS=$(aws cloudformation describe-stacks \
--stack-name "${STACK_ID_VPC}" \
--query 'Stacks[0].Outputs[?OutputKey==`FSxOpenZFSFileSystemDNSname`].OutputValue' \
--output text)

if [ -n "$FSX_OPENZFS_DNS" ]; then
echo -e "${BLUE}FSx OpenZFS detected in stack. DNS: ${FSX_OPENZFS_DNS}${NC}"
echo -e "${BLUE}Enabling FSx OpenZFS in LCS...${NC}"

# Get the FSx OpenZFS File System ID as well
FSX_OPENZFS_ID=$(aws cloudformation describe-stacks \
--stack-name "${STACK_ID_VPC}" \
--query 'Stacks[0].Outputs[?OutputKey==`FSxOpenZFSFileSystemId`].OutputValue' \
--output text)

ENABLE_FSX_OPENZFS="true"
# Persist both values by appending export lines to the env_vars file.
echo "export FSX_OPENZFS_DNS=${FSX_OPENZFS_DNS}" >> env_vars
echo "export FSX_OPENZFS_ID=${FSX_OPENZFS_ID}" >> env_vars

# Update config.py
# Flips the flag in place; -i.bak leaves a backup copy that is removed on the
# next line. The path is relative to the current working directory.
sed -i.bak 's/enable_fsx_openzfs = False/enable_fsx_openzfs = True/' base-config/config.py
rm base-config/config.py.bak

# NOTE(review): this is printed without checking that sed changed anything.
echo -e "${GREEN}✅ Lifecycle Scripts modified successfully! FSx OpenZFS enabled in config.py${NC}"
else
echo -e "${BLUE}No FSx OpenZFS detected in stack. Continuing with FSx OpenZFS disabled in LCS...${NC}"
fi

echo -e "${YELLOW}Did you deploy the optional hyperpod-observability CloudFormation stack? (yes/no)${NC}"
read -e DEPLOYED_OBSERVABILITY

Expand Down Expand Up @@ -743,6 +776,14 @@ EOL
WORKER_GROUPS+="
]"

# OpenZFS
# JSON fragment spliced into the provisioning-parameter heredocs directly
# after the comma that follows "fsx_mountname"; some of those heredocs add
# another comma after the fragment. It must therefore always expand to exactly
# one key/value pair, with no leading or trailing comma.
if [[ $ENABLE_FSX_OPENZFS == "true" ]]; then
  FSX_OPENZFS_CONFIG="
\"fsx_openzfs_dns_name\": \"${FSX_OPENZFS_ID}.fsx.${AWS_REGION}.amazonaws.com\""
else
  # An empty fragment would leave `,,` or a dangling `,` before `}` in the
  # generated file, which is not valid JSON. Emit the key with an empty value
  # instead; the lifecycle script mounts OpenZFS only when this value is truthy.
  FSX_OPENZFS_CONFIG="
\"fsx_openzfs_dns_name\": \"\""
fi

#MH
if [[ $MH == "true" ]]; then
SLURM_CONFIGURATIONS="
Expand All @@ -765,7 +806,7 @@ EOL
"login_group": "login-group",
"worker_groups": $WORKER_GROUPS,
"fsx_dns_name": "${FSX_ID}.fsx.${AWS_REGION}.amazonaws.com",
"fsx_mountname": "${FSX_MOUNTNAME}",
"fsx_mountname": "${FSX_MOUNTNAME}",${FSX_OPENZFS_CONFIG},
"slurm_configurations": $SLURM_CONFIGURATIONS
}
EOL
Expand All @@ -778,7 +819,7 @@ EOL
"login_group": "login-group",
"worker_groups": $WORKER_GROUPS,
"fsx_dns_name": "${FSX_ID}.fsx.${AWS_REGION}.amazonaws.com",
"fsx_mountname": "${FSX_MOUNTNAME}"
"fsx_mountname": "${FSX_MOUNTNAME}",${FSX_OPENZFS_CONFIG}
}
EOL
fi
Expand All @@ -791,7 +832,7 @@ EOL
"controller_group": "$CONTROLLER_NAME",
"worker_groups": $WORKER_GROUPS,
"fsx_dns_name": "${FSX_ID}.fsx.${AWS_REGION}.amazonaws.com",
"fsx_mountname": "${FSX_MOUNTNAME}",
"fsx_mountname": "${FSX_MOUNTNAME}",${FSX_OPENZFS_CONFIG},
"slurm_configurations": $SLURM_CONFIGURATIONS
}
EOL
Expand All @@ -803,7 +844,7 @@ EOL
"controller_group": "$CONTROLLER_NAME",
"worker_groups": $WORKER_GROUPS,
"fsx_dns_name": "${FSX_ID}.fsx.${AWS_REGION}.amazonaws.com",
"fsx_mountname": "${FSX_MOUNTNAME}"
"fsx_mountname": "${FSX_MOUNTNAME}",${FSX_OPENZFS_CONFIG}
}
EOL
fi
Expand Down
Loading