Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions apache/livy/ExposedUI/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Setup Apache Livy with Docker Compose

```bash
docker compose build spark-master
docker compose up
```
# Access the Livy UI and execute PySpark code
```bash
curl -X POST -H "Content-Type: application/json" -d '{"kind":"pyspark"}' http://localhost:8998/sessions
# {"id":6,"name":null,"appId":null,"owner":null,"proxyUser":null,"state":"starting","kind":"pyspark","appInfo":{"driverLogUrl":null,"sparkUiUrl":null},"log":["stdout: ","\nstderr: "],"ttl":null,"driverMemory":null,"driverCores":0,"executorMemory":null,"executorCores":0,"conf":{},"archives":[],"files":[],"heartbeatTimeoutInSecond":0,"jars":[],"numExecutors":0,"pyFiles":[],"queue":null}

# replace id from last response with $id
curl -X POST -H "Content-Type: application/json" -d '{"code":"import os\nprint(os.getcwd())"}' http://localhost:8998/sessions/$id/statements
# "java.lang.IllegalStateException: Session is in state starting"
# wait 30sec
# {"id":0,"code":"import os\nprint(os.getcwd())","state":"waiting","output":null,"progress":0.0,"started":0,"completed":0}

# replace id from last reseponse with #statements_id
curl http://127.0.0.1:8998/sessions/$id/statements/$statements_id
# output.data is the stdout
# {"id":0,"code":"import os\nprint(os.getcwd())","state":"available","output":{"status":"ok","execution_count":0,"data":{"text/plain":"/opt"}},"progress":1.0,"started":1754515902001,"completed":1754515902003}
```
16 changes: 16 additions & 0 deletions apache/livy/ExposedUI/apache-livy/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#https://github.com/apache/incubator-livy?tab=readme-ov-file#building-livy
# Reuse the same image built for Spark Master/Worker
FROM mounirbs-local/spark-python3-java11:3.5.4
USER root
ENV LIVY_HOME /opt/livy
WORKDIR /opt/
# Get livy binaries from: https://livy.apache.org/download/
RUN apt-get update && apt-get install -y unzip \
&& curl "https://dlcdn.apache.org/incubator/livy/0.8.0-incubating/apache-livy-0.8.0-incubating_2.12-bin.zip" -O \
&& unzip "apache-livy-0.8.0-incubating_2.12-bin" \
&& rm -rf "apache-livy-0.8.0-incubating_2.12-bin.zip" \
&& mv "apache-livy-0.8.0-incubating_2.12-bin" $LIVY_HOME \
&& mkdir $LIVY_HOME/logs \
&& chown -R spark:spark $LIVY_HOME

USER spark
108 changes: 108 additions & 0 deletions apache/livy/ExposedUI/apache-livy/conf/livy-client.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# Configurations for a Livy Client, any configurations set here will override any
# livy or spark-default configurations.
#
# Before a Livy Client is able to load these configurations the folder containing
# this file must be added to the application classpath
#

#
# Configurations for Livy HTTPClient
#

# HTTP Request configurations
# How long before a request times out
# livy.client.http.connection.timeout = 10s
# How long between data packets before a request times out
# livy.client.http.connection.socket.timeout = 5m
# Whether content is compressed
# livy.client.http.content.compress.enable = true

# How long before idle connections are closed
# livy.client.http.connection.idle.timeout = 10m

# Initial interval before polling for Job results
# livy.client.http.job.initial-poll-interval = 100ms
# Maximum interval between successive polls
# livy.client.http.job.max-poll-interval = 5s

#
# Configurations for Livy RSCClient
#

# Configurations for registering a client with the rpc server
# Unique client id for connections to the rpc server
# livy.rsc.client.auth.id =
# Secret value for authenticating client connections with server
# livy.rsc.client.auth.secret =

# Timeout when stopping a rsc client
# livy.rsc.client.shutdown-timeout = 10s

# Class of the rsc driver to use
# livy.rsc.driver-class =
# The kind of rsc session. Examples: pyspark or sparkr
# livy.rsc.session.kind =

# Comma-separated list of Livy RSC jars. By default Livy will upload jars from its installation
# directory every time a session is started. By caching these files in HDFS, for example, startup
# time of sessions on YARN can be reduced.
# livy.rsc.jars =
# Location of the SparkR package for running sparkr
# livy.rsc.sparkr.package =
# Location of the PySpark package for running pyspark
# livy.rsc.pyspark.archives =

# Address for the RSC driver to connect back with it's connection info.
# livy.rsc.launcher.address =

# Port Range on which RPC will launch . Port range in inclusive of start and end port .
livy.rsc.launcher.port.range = 10000~10010

# How long will the RSC wait for a connection for a Livy server before shutting itself down.
livy.rsc.server.idle-timeout = 10m

# The user that should be impersonated when requesting a Livy session
# livy.rsc.proxy-user =

# Host or IP adress of the rpc server

#livy.rsc.rpc.server.address = livy-server
# How long the rsc client will wait when attempting to connect to the Livy server
#livy.rsc.server.connect.timeout = 90s

# The logging level for the rpc channel. Possible values: TRACE, DEBUG, INFO, WARN, or ERROR
livy.rsc.channel.log.level = ERROR

# SASL configurations for authentication
# SASL mechanism used for authentication
# livy.rsc.rpc.sasl.mechanisms = DIGEST-MD5
# SASL qop used for authentication
# livy.rsc.rpc.sasl.qop =

# Time between status checks for cancelled a Job
# livy.rsc.job-cancel.trigger-interval = 100ms
# Time before a cancelled a Job is forced into a Cancelled state
# livy.rsc.job-cancel.timeout = 30s

# Number of statements kept in driver's memory
# livy.rsc.retained-statements = 100
#
livy.rsc.jars = /opt/livy/rsc-jars/livy-api-0.8.0-incubating.jar, /opt/livy/rsc-jars/livy-rsc-0.8.0-incubating.jar
38 changes: 38 additions & 0 deletions apache/livy/ExposedUI/apache-livy/conf/livy-env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# LIVY ENVIRONMENT VARIABLES
#
# - JAVA_HOME Java runtime to use. By default use "java" from PATH.
# - HADOOP_CONF_DIR Directory containing the Hadoop / YARN configuration to use.
# - SPARK_HOME Spark which you would like to use in Livy.
# - SPARK_CONF_DIR Optional directory where the Spark configuration lives.
# (Default: $SPARK_HOME/conf)
# - LIVY_LOG_DIR Where log files are stored. (Default: ${LIVY_HOME}/logs)
# - LIVY_PID_DIR Where the pid file is stored. (Default: /tmp)
# - LIVY_SERVER_JAVA_OPTS Java Opts for running livy server (You can set jvm related setting here,
# like jvm memory/gc algorithm and etc.)
# - LIVY_IDENT_STRING A name that identifies the Livy server instance, used to generate log file
# names. (Default: name of the user starting Livy).
# - LIVY_MAX_LOG_FILES Max number of log file to keep in the log directory. (Default: 5.)
# - LIVY_NICENESS Niceness of the Livy server process when running in the background. (Default: 0.)
# - LIVY_CLASSPATH Override if the additional classpath is required.

export JAVA_HOME=/opt/java/openjdk
export SPARK_HOME=/opt/spark
export LIVY_LOG_DIR=/opt/livy/logs
export SPARK_CONF_DIR=/opt/spark/conf
198 changes: 198 additions & 0 deletions apache/livy/ExposedUI/apache-livy/conf/livy.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Use this keystore for the SSL certificate and key.
# livy.keystore =

# Specify the keystore password.
# livy.keystore.password =
#
# Specify the key password.
# livy.key-password =

# Hadoop Credential Provider Path to get "livy.keystore.password" and "livy.key-password".
# Credential Provider can be created using command as follow:
# hadoop credential create "livy.keystore.password" -value "secret" -provider jceks://hdfs/path/to/livy.jceks
# livy.hadoop.security.credential.provider.path =

# What host address to start the server on. By default, Livy will bind to all network interfaces.
livy.server.host = 0.0.0.0

# What port to start the server on.
livy.server.port = 8998

# What base path ui should work on. By default UI is mounted on "/".
# E.g.: livy.ui.basePath = /my_livy - result in mounting UI on /my_livy/
# livy.ui.basePath = ""

# What spark master Livy sessions should use.
livy.spark.master = spark://spark-master:7077

# What spark deploy mode Livy sessions should use.
livy.spark.deploy-mode = client

# Configure Livy server http request and response header size.
# livy.server.request-header.size = 131072
# livy.server.response-header.size = 131072

# Whether or not to send server version in http response.
# livy.server.send-server-version = false

# Enabled to check whether timeout Livy sessions should be stopped.
#livy.server.session.timeout-check = true
#
# Whether or not to skip timeout check for a busy session
#livy.server.session.timeout-check.skip-busy = false

# Time in milliseconds on how long Livy will wait before timing out an inactive session.
# Note that the inactive session could be busy running jobs.
#livy.server.session.timeout = 5m
#
# How long a finished session state should be kept in LivyServer for query.
#livy.server.session.state-retain.sec = 120s

# If livy should impersonate the requesting users when creating a new session.
livy.impersonation.enabled = true

# Logs size livy can cache for each session/batch. 0 means don't cache the logs.
# livy.cache-log.size = 200

# Comma-separated list of Livy RSC jars. By default Livy will upload jars from its installation
# directory every time a session is started. By caching these files in HDFS, for example, startup
# time of sessions on YARN can be reduced.
# livy.rsc.jars =

# Comma-separated list of Livy REPL jars. By default Livy will upload jars from its installation
# directory every time a session is started. By caching these files in HDFS, for example, startup
# time of sessions on YARN can be reduced. Please list all the repl dependencies including
# Scala version-specific livy-repl jars, Livy will automatically pick the right dependencies
# during session creation.
# livy.repl.jars =

# Location of PySpark archives. By default Livy will upload the file from SPARK_HOME, but
# by caching the file in HDFS, startup time of PySpark sessions on YARN can be reduced.
# livy.pyspark.archives =

# Location of the SparkR package. By default Livy will upload the file from SPARK_HOME, but
# by caching the file in HDFS, startup time of R sessions on YARN can be reduced.
# livy.sparkr.package =

# List of local directories from where files are allowed to be added to user sessions. By
# default it's empty, meaning users can only reference remote URIs when starting their
# sessions.
livy.file.local-dir-whitelist = /target/

# Whether to enable csrf protection, by default it is false. If it is enabled, client should add
# http-header "X-Requested-By" in request if the http method is POST/DELETE/PUT/PATCH.
# livy.server.csrf-protection.enabled =

# Whether to enable HiveContext in livy interpreter, if it is true hive-site.xml will be detected
# on user request and then livy server classpath automatically.
# livy.repl.enable-hive-context =

# Recovery mode of Livy. Possible values:
# off: Default. Turn off recovery. Every time Livy shuts down, it stops and forgets all sessions.
# recovery: Livy persists session info to the state store. When Livy restarts, it recovers
# previous sessions from the state store.
# Must set livy.server.recovery.state-store and livy.server.recovery.state-store.url to
# configure the state store.
# livy.server.recovery.mode = off
# Zookeeper address used for HA and state store. e.g. host1:port1, host2:port2
# livy.server.zookeeper.url =

# Where Livy should store state to for recovery. Possible values:
# <empty>: Default. State store disabled.
# filesystem: Store state on a file system.
# zookeeper: Store state in a Zookeeper instance.
# livy.server.recovery.state-store =

# For filesystem state store, the path of the state store directory. Please don't use a filesystem
# that doesn't support atomic rename (e.g. S3). e.g. file:///tmp/livy or hdfs:///.
# For zookeeper, the address to the Zookeeper servers. e.g. host1:port1,host2:port2
# If livy.server.recovery.state-store is zookeeper, this config is for back-compatibility,
# so if both this config and livy.server.zookeeper.url exist,
# livy uses livy.server.zookeeper.url first.
# livy.server.recovery.state-store.url =

# The policy of curator connecting to zookeeper.
# For example, m, n means retry m times and the interval of retry is n milliseconds.
# Please use the new config: livy.server.zk.retry-policy.
# Keep this config for back-compatibility.
# If both this config and livy.server.zk.retry-policy exist,
# livy uses livy.server.zk.retry-policy first.
# livy.server.recovery.zk-state-store.retry-policy = 5,100

# The policy of curator connecting to zookeeper.
# For example, m, n means retry m times and the interval of retry is n milliseconds
# livy.server.zk.retry-policy =

# The dir in zk to store the data about session.
# livy.server.recovery.zk-state-store.key-prefix = livy

# If Livy can't find the yarn app within this time, consider it lost.
# livy.server.yarn.app-lookup-timeout = 120s
# When the cluster is busy, we may fail to launch yarn app in app-lookup-timeout, then it would
# cause session leakage, so we need to check session leakage.
# How long to check livy session leakage
# livy.server.yarn.app-leakage.check-timeout = 600s
# how often to check livy session leakage
# livy.server.yarn.app-leakage.check-interval = 60s

# How often Livy polls YARN to refresh YARN app state.
# livy.server.yarn.poll-interval = 5s
#
# Days to keep Livy server request logs.
# livy.server.request-log-retain.days = 5

# If the Livy Web UI should be included in the Livy Server. Enabled by default.
livy.ui.enabled = true

# Whether to enable Livy server access control, if it is true then all the income requests will
# be checked if the requested user has permission.
# livy.server.access-control.enabled = false

# Allowed users to access Livy, by default any user is allowed to access Livy. If user want to
# limit who could access Livy, user should list all the permitted users with comma separated.
# livy.server.access-control.allowed-users = *

# A list of users with comma separated has the permission to change other user's submitted
# session, like submitting statements, deleting session.
# livy.server.access-control.modify-users =

# A list of users with comma separated has the permission to view other user's infomation, like
# submitted session state, statement results.
# livy.server.access-control.view-users =
#
# Authentication support for Livy server
# Livy has a built-in SPnego authentication support for HTTP requests with below configurations.
# livy.server.auth.type = kerberos
# livy.server.auth.kerberos.principal = <spnego principal>
# livy.server.auth.kerberos.keytab = <spnego keytab>
# livy.server.auth.kerberos.name-rules = DEFAULT
#
# If user wants to use custom authentication filter, configurations are:
# livy.server.auth.type = <custom>
# livy.server.auth.<custom>.class = <class of custom auth filter>
# livy.server.auth.<custom>.param.<foo1> = <bar1>
# livy.server.auth.<custom>.param.<foo2> = <bar2>

# Enable to allow custom classpath by proxy user in cluster mode
# The below configuration parameter is disabled by default.
# livy.server.session.allow-custom-classpath = true

livy.repl.jars = /opt/livy/jars/livy-client-common-0.8.0-incubating.jar, /opt/livy/repl_2.12-jars/livy-core_2.12-0.8.0-incubating.jar, /opt/livy/repl_2.12-jars/commons-codec-1.9.jar, /opt/livy/repl_2.12-jars/livy-core_2.12-0.8.0-incubating.jar, /opt/livy/repl_2.12-jars/minlog-1.3.0.jar, /opt/livy/repl_2.12-jars/kryo-shaded-4.0.2.jar, /opt/livy/repl_2.12-jars/livy-repl_2.12-0.8.0-incubating.jar, /opt/livy/repl_2.12-jars/objenesis-2.5.1.jar

livy.rsc.jars = /opt/livy/rsc-jars/livy-api-0.8.0-incubating.jar, /opt/livy/rsc-jars/livy-rsc-0.8.0-incubating.jar
Loading