Skip to content

Commit 0511f69

Browse files
authored
aravneel update lcs - openzfs mount and user add permissions (#899)
* Update OpenZFS mounting logic * Add forced creation of the .ssh symlink * Add chown of .ssh to the user * Update cluster user configuration based on whether OpenZFS is present * Extend home setup to users other than ubuntu * Fix relative path for shared_users.txt * Add xargs to strip carriage returns * Fix ownership of the .ssh symlink directory * Add user flag for login * Fix race condition during file-access testing for FSx Lustre and OpenZFS
1 parent 8d9c95e commit 0511f69

File tree

6 files changed

+377
-110
lines changed

6 files changed

+377
-110
lines changed

1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,11 +166,13 @@ mount_fs() {
166166
local max_attempts=5
167167
local attempt=1
168168
local delay=5
169+
local test_file="$MOUNT_POINT/test_file_$(hostname)"
169170

170171
echo "[INFO] Ensuring $MOUNT_POINT directory exists..."
171172
ansible localhost -b -m ansible.builtin.file -a "path=$MOUNT_POINT state=directory" || true
172173

173174
echo "[INFO] Mounting FSx Lustre on $MOUNT_POINT..."
175+
echo "[INFO] Using test file: $test_file"
174176

175177
while (( attempt <= max_attempts )); do
176178
echo "============================"
@@ -193,13 +195,13 @@ mount_fs() {
193195
ls -la "$MOUNT_POINT" >/dev/null 2>&1 || true
194196

195197
echo "[STEP] Testing file access (touch)..."
196-
if ! ansible localhost -b -m ansible.builtin.file -a "path=$MOUNT_POINT/test_file state=touch"; then
198+
if ! ansible localhost -b -m ansible.builtin.file -a "path=$test_file state=touch"; then
197199
echo "[WARN] Touch failed — retrying in $delay seconds"
198200
sleep "$delay"; ((attempt++)); continue
199201
fi
200202

201203
echo "[STEP] Testing file access (delete)..."
202-
if ! ansible localhost -b -m ansible.builtin.file -a "path=$MOUNT_POINT/test_file state=absent"; then
204+
if ! ansible localhost -b -m ansible.builtin.file -a "path=$test_file state=absent"; then
203205
echo "[WARN] Delete failed — retrying in $delay seconds"
204206
sleep "$delay"; ((attempt++)); continue
205207
fi

1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx_openzfs.sh

Lines changed: 57 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -76,21 +76,65 @@ install_nfs_client()
7676
# Mount the FSx OpenZFS file system
7777
mount_fs()
{
    # Mount FSx OpenZFS over NFS and prove the mount is usable, retrying on failure.
    #
    # Globals read: OPENZFS_MOUNT_POINT, FSX_OPENZFS_DNS_NAME, NFS_VERSION
    # Returns:      0 as soon as one attempt mounts, verifies, touches and deletes
    #               the probe file; 1 after max_attempts failed attempts.
    local max_attempts=5
    local delay=5
    local attempt
    local retry_note
    # Per-host probe file name, so nodes mounting the same file system at the
    # same time do not touch/delete each other's file.
    local test_file="$OPENZFS_MOUNT_POINT/test_file_$(hostname)"

    echo "[INFO] Ensuring $OPENZFS_MOUNT_POINT directory exists..."
    # Best effort: a failure here surfaces in the mount step below.
    ansible localhost -b -m ansible.builtin.file -a "path=$OPENZFS_MOUNT_POINT state=directory" || true

    echo "[INFO] Mounting FSx OpenZFS on $OPENZFS_MOUNT_POINT..."
    echo "[INFO] Using test file: $test_file"

    for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
        echo "============================"
        echo "[INFO] Attempt $attempt of $max_attempts"
        echo "============================"

        # Only promise (and perform) a retry when another attempt remains.
        retry_note=""
        if (( attempt < max_attempts )); then
            retry_note=" — retrying in $delay seconds"
        fi

        if _openzfs_mount_attempt "$test_file" "$retry_note"; then
            echo "[SUCCESS] FSx OpenZFS mount succeeded on attempt $attempt"
            return 0
        fi

        if (( attempt < max_attempts )); then
            sleep "$delay"
        fi
    done

    echo "[ERROR] FSx OpenZFS mount failed after $max_attempts attempts"
    return 1
}

# Run one mount + verification pass for mount_fs.
#   $1 - path of the probe file to touch and delete on the mounted file system
#   $2 - suffix appended to each warning (e.g. " — retrying in 5 seconds", or empty)
# Returns 0 if every step succeeds, 1 at the first failing step.
_openzfs_mount_attempt()
{
    local probe_file=$1
    local warn_suffix=$2

    echo "[STEP] Mounting FSx OpenZFS..."
    if ! ansible localhost -b -m ansible.posix.mount -a \
        "path=$OPENZFS_MOUNT_POINT src=$FSX_OPENZFS_DNS_NAME:/fsx fstype=nfs opts=nfsvers=$NFS_VERSION,_netdev,nconnect=16,x-systemd.automount,x-systemd.requires=network-online.target dump=0 passno=0 state=mounted"; then
        echo "[WARN] Mount command failed$warn_suffix"
        return 1
    fi

    echo "[STEP] Verifying mountpoint..."
    if ! ansible localhost -b -m ansible.builtin.command -a "mountpoint $OPENZFS_MOUNT_POINT"; then
        echo "[WARN] Mountpoint verification failed$warn_suffix"
        return 1
    fi

    echo "[STEP] Triggering automount..."
    # Listing the directory is only meant to poke the automount; its result is irrelevant.
    ls -la "$OPENZFS_MOUNT_POINT" >/dev/null 2>&1 || true

    echo "[STEP] Testing file access (touch)..."
    if ! ansible localhost -b -m ansible.builtin.file -a "path=$probe_file state=touch"; then
        echo "[WARN] Touch failed$warn_suffix"
        return 1
    fi

    echo "[STEP] Testing file access (delete)..."
    if ! ansible localhost -b -m ansible.builtin.file -a "path=$probe_file state=absent"; then
        echo "[WARN] Delete failed$warn_suffix"
        return 1
    fi

    return 0
}
86130

87-
# Verify mount was successful
88-
verify_mount()
131+
# Restart systemd daemon to ensure mount units are properly loaded
132+
restart_daemon()
{
    # Reload systemd and restart remote-fs.target so the mount units are picked up,
    # then print whatever automount units systemd currently lists.
    local systemd_args

    for systemd_args in "daemon_reload=yes" "name=remote-fs.target state=restarted"; do
        ansible localhost -b -m ansible.builtin.systemd -a "$systemd_args"
    done

    echo "Check status of OpenZFS automount..."
    # grep exits non-zero when nothing matches; that must not fail the function.
    systemctl list-units | grep -i automount || true
}
95139

96140
main()
@@ -99,8 +143,8 @@ main()
99143
echo "Using openzfs_mount_point: $OPENZFS_MOUNT_POINT"
100144
verify_parameters
101145
install_nfs_client
102-
mount_fs
103-
verify_mount
146+
mount_fs || exit 1
147+
restart_daemon
104148
echo "FSx OpenZFS mounted successfully to $OPENZFS_MOUNT_POINT"
105149
}
106150

1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/fsx_ubuntu.sh

Lines changed: 136 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ FSX_OZFS_EXISTS=$1
77
FSX_OPENZFS_DNS_NAME="/home"
88
FSX_L_DNS_NAME="/fsx"
99

10+
# Look for shared_users.txt in parent directory
11+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
12+
SHARED_USER_FILE="${SCRIPT_DIR}/../shared_users.txt"
13+
1014
# Function to check mount
1115
check_mount()
1216
{
@@ -36,56 +40,152 @@ wait_for_mount()
3640
done
3741
}
3842

43+
# Function to setup home directory for a user with OpenZFS
44+
setup_user_home_openzfs()
{
    # Point a user's home at <OpenZFS mount>/<user> and keep a per-user directory
    # on the FSx Lustre mount as well.
    #   $1 - user name (also used as the owning group name)
    local login=$1
    local zfs_home="$FSX_OPENZFS_DNS_NAME/$login"
    local lustre_dir="$FSX_L_DNS_NAME/$login"

    echo "Setting up OpenZFS home directory for user: $login"

    # Create the user's directory on OpenZFS with the expected owner and mode.
    ansible localhost -b -m ansible.builtin.file \
        -a "path='$zfs_home' state=directory owner=$login group=$login mode=0755"

    # Re-home the account there, asking the user module to move the existing home.
    ansible localhost -b -m ansible.builtin.user \
        -a "name=$login home='$zfs_home' move_home=yes"
    echo "Home directory set to $zfs_home"

    # The Lustre directory is optional: skip it when that mount never shows up.
    if ! wait_for_mount "$FSX_L_DNS_NAME"; then
        echo "Warning: FSx Lustre mount not available, skipping $lustre_dir setup"
        return
    fi

    sudo mkdir -p "$lustre_dir"
    sudo chown "$login:$login" "$lustre_dir"
}
65+
66+
# Function to setup home directory for a user with FSx Lustre only
67+
setup_user_home_fsx_lustre()
{
    # Make a directory on FSx Lustre the home directory of a user.
    #   $1 - user name (also used as the owning group name)
    #   $2 - target home directory on the Lustre file system
    local account=$1
    local target_home=$2

    echo "Setting up FSx Lustre home directory for user: $account at $target_home"

    # Target already exists: just repoint the account at it.
    if [[ -d "$target_home" ]]; then
        sudo usermod -d "$target_home" "$account"
        return
    fi

    # Without the Lustre mount directory there is nothing to do.
    if [[ ! -d "$FSX_L_DNS_NAME" ]]; then
        return 0
    fi

    sudo mkdir -p "$target_home"
    sudo chown "$account:$account" "$target_home"

    # Preferred path: let usermod move the existing home contents.
    if sudo usermod -m -d "$target_home" "$account"; then
        echo "Home directory moved successfully to $target_home"
        return
    fi

    echo "Warning: Could not move home directory for $account. Setting home without moving files."

    # Fallback: copy any existing /home/<user> content by hand, fix ownership,
    # then change the home path without asking usermod to move anything.
    if [[ -d "/home/$account" ]]; then
        sudo rsync -a "/home/$account/" "$target_home/"
    fi
    sudo chown -R "$account:$account" "$target_home"

    sudo usermod -d "$target_home" "$account"
}
97+
3998
# Set up home directories for every user listed in shared_users.txt.
#   $1 - "openzfs": home is always <OpenZFS mount>/<user>; the file's home column is ignored
#        "lustre":  home is taken from the third column, which must be non-empty
# Each line of the file is "username,uid,home". Users that do not exist on this
# node are skipped with a warning.
process_shared_users()
{
    local mode=$1
    local username _uid home

    if [[ ! -f "$SHARED_USER_FILE" ]]; then
        echo "No $SHARED_USER_FILE found, only ubuntu user configured"
        return 0
    fi

    echo "Found $SHARED_USER_FILE, processing additional users..."
    echo "Contents of $SHARED_USER_FILE:"
    cat "$SHARED_USER_FILE"

    # The file is read on fd 3 so that ansible/sudo inside the loop cannot
    # consume the remaining entries from stdin. The "|| [[ -n ... ]]" clause
    # keeps a final line that has no trailing newline from being dropped.
    while IFS="," read -r -u 3 username _uid home || [[ -n "$username" ]]; do
        # Drop carriage returns explicitly (CRLF-edited files), then trim
        # surrounding whitespace.
        username=$(echo "${username//$'\r'/}" | xargs)
        home=$(echo "${home//$'\r'/}" | xargs)

        # Skip empty lines; in lustre mode a missing home column is also invalid.
        if [[ -z "$username" ]] || { [[ "$mode" == "lustre" ]] && [[ -z "$home" ]]; }; then
            echo "Skipping empty or invalid line"
            continue
        fi

        # Verify user exists before trying to set up home
        if ! id -u "$username" >/dev/null 2>&1; then
            echo "WARNING: User $username does not exist, skipping home setup"
            continue
        fi

        if [[ "$mode" == "lustre" ]]; then
            echo "Processing home directory for user: '$username' at '$home'"
            setup_user_home_fsx_lustre "$username" "$home"
        else
            echo "Processing home directory for user: '$username'"
            setup_user_home_openzfs "$username"
        fi
    done 3< "$SHARED_USER_FILE"

    echo "All users from $SHARED_USER_FILE processed successfully"
}

if [ -z "$FSX_OZFS_EXISTS" ]; then
    echo "Error: Missing parameter. Usage: $0 <1|0> (1 if OpenZFS exists, 0 otherwise)"
    exit 1
fi

# Check if OpenZFS is mounted
if [ "$FSX_OZFS_EXISTS" -eq 1 ]; then
    echo "OpenZFS is mounted. Setting up home directories on OpenZFS."

    if wait_for_mount "$FSX_OPENZFS_DNS_NAME"; then
        # Setup ubuntu user first
        echo "Setting up home directory for default ubuntu user..."
        setup_user_home_openzfs "ubuntu"

        # Process additional users from shared_users.txt if it exists
        process_shared_users "openzfs"
    else
        # Make the skipped setup visible in the logs; the exit status is left unchanged.
        echo "Warning: OpenZFS mount not available at $FSX_OPENZFS_DNS_NAME, skipping home directory setup"
    fi
else
    echo "OpenZFS is not mounted. Using FSx Lustre file system as home..."

    if ! wait_for_mount "$FSX_L_DNS_NAME"; then
        echo "Warning: FSx mount not available. Exiting."
        exit 1
    fi

    # Setup ubuntu user first
    echo "Setting up home directory for default ubuntu user..."
    setup_user_home_fsx_lustre "ubuntu" "$FSX_L_DNS_NAME/ubuntu"

    # Process additional users from shared_users.txt if it exists
    process_shared_users "lustre"
fi

echo "Home directory setup completed for all users"

0 commit comments

Comments
 (0)