5
5
# from 'requirements.txt' and 'bootstrap_on_start' parameter was passed to the wrapper, or manually
6
6
# from a training / processing / inference script, e.g. with subprocess.check_call()
7
7
8
- # This script can be called simultaneously multiple times in a distributed training job
8
+ # This script can be called simultaneously multiple times in a distributed training or inference job
9
9
# To avoid race conditions, we install helper scripts under an exclusive lock
10
10
if [[ " $1 " == " install-helper-scripts" ]]; then
11
11
dir=$( dirname " $0 " )
17
17
18
18
set -e
19
19
20
- flock /tmp/sm-install-lock bash " $0 " install-helper-scripts | sed ' s/^/[sagemaker-ssh-helper][sm-setup-ssh] /'
20
+ if [[ " $1 " == " start-ssh" ]]; then
21
+ dir=$( dirname " $0 " )
22
+ source " $dir " /sm-helper-functions
23
+
24
+ # Log IP addresses of the container (useful only in training in combination with VPC + VPN)
25
+ echo " SSH Helper Log IP: $( hostname -I) "
26
+
27
+ chmod 1777 /tmp
28
+ mkdir -p ~ /.ssh
29
+
30
+ # Install SSH (if using MPI, already installed)
31
+ if _is_centos; then
32
+ yum install -y openssh-server
33
+ else
34
+ export DEBIAN_FRONTEND=noninteractive
35
+ apt-get update
36
+ apt-get install -y --no-install-recommends openssh-server
37
+ fi
38
+
39
+ # Save and dump SageMaker environment for SSH sessions
40
+ sm-save-env
41
+
42
+ # Dump container bootstrap environment (PID 1) - can be different from above, useful for debugging
43
+ ps wwwe -p 1 | tail -1
44
+
45
+ sed -i -e ' s~^ClientAliveInterval~#ClientAliveInterval~' /etc/ssh/sshd_config
46
+ echo " ClientAliveInterval 15" >> /etc/ssh/sshd_config
47
+
48
+ sed -i -e ' s~^PermitRootLogin~#PermitRootLogin~' /etc/ssh/sshd_config
49
+ echo PermitRootLogin yes >> /etc/ssh/sshd_config
50
+
51
+ sed -i -e ' s~^AuthorizedKeysFile~#AuthorizedKeysFile~' /etc/ssh/sshd_config
52
+ echo " AuthorizedKeysFile /etc/ssh/authorized_keys" >> /etc/ssh/sshd_config
53
+
54
+ # Start SSH server
55
+ if _is_centos; then
56
+ # NOTE: systemctl will not work in CentOS SageMaker container (e.g. Spark processing) because of a lack of
57
+ # privileges to access DBUS, so we run sshd manually. This command doesn't work:
58
+ # # service sshd start || (echo "ERROR: Failed to start sshd service" && exit 255)
59
+ [[ -f /etc/ssh/ssh_host_rsa_key ]] || (echo " Generating new SSH keys" && ssh-keygen -A)
60
+ /usr/sbin/sshd
61
+ else
62
+ service ssh start || (echo " ERROR: Failed to start ssh service" && exit 255)
63
+ fi
64
+
65
+ sm-init-ssm
66
+
67
+ # Running forever as daemon
68
+ amazon-ssm-agent
69
+
70
+ echo " ERROR: agent died"
71
+ exit 1 # should never reach this line
72
+ fi
73
+
74
+ flock /tmp/sm-install-lock bash " $0 " install-helper-scripts \
75
+ | sed -u ' s/^/[sagemaker-ssh-helper][sm-setup-ssh] /'
21
76
22
77
# nohup will detach the child process from the parent and run it in the background
23
78
# flock prevents more than one process from starting
24
79
# redirection to /proc/1/fd/1 will write logs to CloudWatch
25
80
# sed will prepend log output with SSH Helper prefix
26
81
if [[ ! -f /tmp/sm-start-ssh-lock ]]; then
27
82
if [[ " $SSH_LOG_TO_STDOUT " == " true" ]]; then
28
- flock -n /tmp/sm-start-ssh-lock sm- start-ssh &
83
+ flock -n /tmp/sm-start-ssh-lock bash " $0 " start-ssh &
29
84
else
30
85
nohup flock -n /tmp/sm-start-ssh-lock \
31
- sm- start-ssh 2>&1 \
86
+ bash " $0 " start-ssh 2>&1 \
32
87
| sed -u ' s/^/[sagemaker-ssh-helper][sm-start-ssh] /' \
33
88
> /proc/1/fd/1 2>&1 &
34
89
fi
35
90
fi
36
91
37
- sm-wait " ${SSH_WAIT_TIME_SECONDS:- 60} " | sed ' s/^/[sagemaker-ssh-helper][sm-setup-ssh] /'
92
+ sm-wait " ${SSH_WAIT_TIME_SECONDS:- 60} " \
93
+ | sed -u ' s/^/[sagemaker-ssh-helper][sm-setup-ssh] /'
0 commit comments