Commit 93d13bf (initial commit, 0 parents)

Shreyas Srivathsa authored and committed

Initial commit

File tree

80 files changed: +1813 additions, 0 deletions

.gitignore

Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
+artifacts/
+.venv/
+dump.rdb
+*.tar
+client/__pycache__/
+db/__pycache__/
+grpc_config/out/__pycache__
+processor/__pycache__/
+producer/__pycache__/
+server/__pycache__/

.vscode/launch.json

Lines changed: 21 additions & 0 deletions

@@ -0,0 +1,21 @@
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Producer",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "producer.producer", // use module instead of program to deal with dependencies involving other user-defined modules
+            "console": "integratedTerminal",
+            "cwd": "${workspaceRoot}"
+        },
+        {
+            "name": "Processor",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "processor.processor", // use module instead of program to deal with dependencies involving other user-defined modules
+            "console": "integratedTerminal",
+            "cwd": "${workspaceRoot}"
+        }
+    ]
+}

LICENSE

Lines changed: 21 additions & 0 deletions

@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Shreyas Srivathsa <srivathsashreyas@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md

Lines changed: 86 additions & 0 deletions

@@ -0,0 +1,86 @@
+# Overview
+
+This is a personal project (purely for learning) that briefly explores setting up a few components of a simple producer-consumer application. The components include:
+1. Producer -> A simple FastAPI microservice which generates a JSON message representing an action (in this case 'login') performed by a user. The message is sent to a Kafka broker
+2. Kafka Broker -> Stores messages sent by the producer. The processor component streams messages from the broker
+3. Processor -> A Spark-based application which uses structured streaming to pull messages from the Kafka broker. It processes the messages, computes the count of a particular action, and writes the result to Redis
+4. Redis -> Stores the raw data (login information) and the metrics (count of logins)
+5. gRPC Server -> A simple gRPC server which exposes an API to retrieve the raw data and/or metrics from Redis
+6. gRPC Client/Consumer -> A FastAPI microservice which includes a gRPC client that interacts with the gRPC server to retrieve the raw data and/or metrics
+
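The producer described in item 1 emits a JSON 'login' event. As a rough sketch, such a message might look like the following (the field names here are illustrative assumptions, not taken from producer/producer.py):

```python
import json
from datetime import datetime, timezone

def make_login_event(user_id: str) -> dict:
    # Hypothetical event shape; the actual fields in producer/producer.py may differ.
    return {
        "user_id": user_id,
        "action": "login",
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }

event = make_login_event("u123")
print(json.dumps(event))
```

A message of this shape is what the Kafka broker would buffer and the Spark processor would later parse and count.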
+## Structure
+
+The project is structured as follows:
+1. client/ -> Logic for the gRPC client/consumer component
+2. db/ -> Logic to connect to Redis
+3. grpc_config/ -> The proto file and the generated gRPC code
+4. processor/ -> Logic for the Spark-based processor component
+5. producer/ -> Logic for the producer component
+6. server/ -> Logic for the gRPC server component
+7. deploy/client -> k8s manifest and Dockerfile for the consumer component
+8. deploy/processor -> k8s manifest and Dockerfile for the processor component
+9. deploy/producer -> k8s manifest and Dockerfile for the producer component
+10. deploy/server -> k8s manifest and Dockerfile for the gRPC server component
+11. deploy/kafka -> k8s manifest to set up a Kafka broker
+12. deploy/redis -> k8s manifest to set up a Redis instance
+13. deploy/ingress_config -> Scripts to apply the ingress-nginx controller and set up MetalLB for load balancing. Note: you may need to configure the pool of IPs in the metallb-config.yaml file based on your local setup
+
+## Prerequisites
+
+1. Python 3.12 (it should work with other 3.x versions, but this is untested)
+2. Colima (on macOS). This setup should also work with k3s running directly on a Linux machine (untested)
+3. Podman or Docker (to build container images)
+4. kubectl
+5. Helm (to install the ingress-nginx controller)
+6. Maven (to retrieve the spark-sql-kafka jar and its dependencies; tested with v3.9.11)
+7. Java (to run Maven commands; tested with OpenJDK v17.0.16)
+
+## Deployment Steps (Local with Colima)
+
+1. Ensure you've built all relevant images. For example, to build the producer image, run
+`podman build -f deploy/producer/Dockerfile -t producer .`. Ensure that the tags match those specified in the corresponding k8s manifest files
+2. Save the images as tar files. For example, to save the producer image, run
+`podman save -o producer.tar producer:latest`
+3. Start Colima with Kubernetes enabled (skip this step if you already have k3s running on your system) -> `colima start --kubernetes --runtime containerd --cpu 4 --memory 4`.
+Adjust the CPU and memory based on your system resources (though this could affect the performance of specific components, particularly Spark)
+4. SSH into the Colima VM -> `colima ssh`. Apply the ingress config to set up the ingress-nginx controller and MetalLB -> `bash deploy/ingress_config/apply_config.sh`
+5. Load the images into the k3s cluster running on the Colima VM -> `sudo ctr -n k8s.io images import <image-name>.tar`
+6. Verify the images have been imported into the cluster -> `sudo ctr -n k8s.io images list`
+7. Apply the manifests using the start_cluster_local.sh script -> `./start_cluster_local.sh`
+8. You may need to reapply the processor component if its pod hasn't started -> `kubectl apply -f deploy/processor/processor.yaml`
+9. Verify all pods are running -> `kubectl get pods -A`
+10. Retrieve the ingress IP -> `kubectl get svc -A` and identify the EXTERNAL-IP of the ingress-nginx-controller service in the ingress-nginx namespace
+11. To test, SSH into the Colima VM and run the following:
+    - `curl -H "Host: producer.local" http://<INGRESS_IP>/login`. This generates a login message and passes it through the other components as described in the overview section
+    - `curl -H "Host: grpc-client.local" http://<INGRESS_IP>/metrics` to retrieve the count of logins, or `curl -H "Host: grpc-client.local" http://<INGRESS_IP>/raw-data` to retrieve the last 10 raw login messages
+
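Steps 1 and 2 above repeat for every component image; a dry-run sketch that only prints the build and save commands (component names assumed from the Structure section; tags must still match the k8s manifests):

```shell
# Print (don't run) the podman build/save commands for each component image.
for c in producer processor server client; do
  echo "podman build -f deploy/${c}/Dockerfile -t ${c} ."
  echo "podman save -o ${c}.tar ${c}:latest"
done
```

Piping the output to `sh` would execute the commands, but inspecting them first keeps mistakes cheap.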
+## Useful Commands (Reference)
+
+1. Create a netshoot pod to test connectivity to other pods/resources using standard network tools (curl, ping, nc, etc.) -> `kubectl run netshoot -i --image=nicolaka/netshoot --restart=Never -- bash`
+2. Exec into the netshoot pod -> `kubectl exec -it <pod-name> -- bash`
+3. Set up a Kafka pod with client tools to debug/test the Kafka broker -> `kubectl run kafka-client --restart="Never" --image=confluentinc/cp-kafka:7.6.0 -- sleep infinity`
+4. If the pod is already running: `kubectl exec -it kafka-client -- bash`.
+Run Kafka commands inside the pod:
+    - to list topics -> `kafka-topics --bootstrap-server kafka-0.kafka.default.svc.cluster.local:9092 --list`
+    - to consume messages from the 'metrics' topic (refer to https://stackoverflow.com/questions/38024514/understanding-kafka-topics-and-partitions for a simple description of partitions in Kafka) -> `kafka-console-consumer --topic metrics --from-beginning --bootstrap-server kafka-0.kafka.default.svc.cluster.local:9092 --partition 0`
+    - to send messages to the 'metrics' topic -> `kafka-console-producer --broker-list kafka-0.kafka.default.svc.cluster.local:9092 --topic metrics`
+5. To send requests to the producer service via ingress-nginx, first get the ingress IP using `kubectl get svc -A` (identify the EXTERNAL-IP of the ingress-nginx-controller service in the ingress-nginx namespace). Then run the following command (replace <INGRESS_IP> with the actual ingress IP):
+`curl -H "Host: producer.local" http://<INGRESS_IP>/login`
+Note: if using Colima, you'll need to run `colima ssh` and then run the curl command from within the Colima VM.
+6. Retrieve the jars associated with spark-sql-kafka-0-10_2.13:4.0.0 using the following Maven command:
+`mvn dependency:copy-dependencies -DoutputDirectory=./jars -DincludeScope=runtime`
+This downloads the jar's dependencies to the ./jars directory.
+Note: pom.xml should be present in the current directory with the appropriate dependency specified. Don't forget to download the jar itself from the Maven repository via https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_2.13/4.0.0/spark-sql-kafka-0-10_2.13-4.0.0.jar or with the `mvn dependency:get` command. Place this jar in the same directory (./jars)
+7. If using separate Spark master and worker nodes -> install bitnami/spark on your cluster with 2 worker pods:
+`helm install spark oci://registry-1.docker.io/bitnamicharts/spark --set worker.replicaCount=2`
+8. To upgrade and set resource limits for the Spark configuration:
+`helm upgrade spark oci://registry-1.docker.io/bitnamicharts/spark --set worker.replicaCount=2 --set worker.resources.limits.cpu=2 --set worker.resources.limits.memory=4Gi`
+9. To generate the Python gRPC code from the proto file, run the following command (from the workspace root directory):
+`python -m grpc_tools.protoc -Igrpc_config/out=./grpc_config --python_out=. --grpc_python_out=. --pyi_out=. ./grpc_config/metrics.proto`
+This sets the parent directory for the generated code to ./grpc_config/out.
+--python_out, --grpc_python_out and --pyi_out specify the relative path (w.r.t. the parent) for the generated code (based on the methods and types specified in the .proto file)
+
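The metrics.proto file itself is not shown in this excerpt. Going only by the names used in client/client.py (MetricsServiceStub, GetRawData with RawDataRequest(limit=...), GetMetrics with MetricsRequest), it plausibly looks something like the sketch below; the response message names and all field layouts are guesses, not taken from the repository:

```proto
syntax = "proto3";

package metrics;

service MetricsService {
  rpc GetRawData (RawDataRequest) returns (RawDataResponse);
  rpc GetMetrics (MetricsRequest) returns (MetricsResponse);
}

message RawDataRequest {
  int32 limit = 1; // client.py passes limit=10
}

// Hypothetical response shapes.
message RawDataResponse {
  repeated string entries = 1;
}

message MetricsRequest {}

message MetricsResponse {
  int64 login_count = 1;
}
```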
+## Known Issues
+
+1. You may need to manually re-create specific components if certain associated resources were not configured properly by the startup script. For example, if the producer component's deployment and service were created but not its ingress, delete the producer component -> `kubectl delete -f deploy/producer/producer.yaml` and re-apply the manifest -> `kubectl apply -f deploy/producer/producer.yaml`
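The processor's core job, per the overview, is counting occurrences of an action in the message stream. Setting Spark aside, that aggregation reduces to something like the following stdlib-only sketch over one hypothetical micro-batch (the real processor/processor.py uses structured streaming and writes the result to Redis):

```python
import json
from collections import Counter

# One hypothetical micro-batch of messages as they might arrive from Kafka.
batch = [
    json.dumps({"user_id": "u1", "action": "login"}),
    json.dumps({"user_id": "u2", "action": "login"}),
    json.dumps({"user_id": "u1", "action": "logout"}),  # an invented second action
]

# Count each action across the batch, as the Spark job does for 'login'.
counts = Counter(json.loads(m)["action"] for m in batch)
print(counts["login"])  # -> 2
```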

client/__init__.py

Whitespace-only changes.

client/client.py

Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
+import grpc
+from grpc_config.out import metrics_pb2_grpc, metrics_pb2
+from google.protobuf.json_format import MessageToDict
+from fastapi import FastAPI
+import os
+
+app = FastAPI()
+
+
+@app.get("/raw-data")
+def retrieve_raw_data():
+    # TODO: add logic to set limit via query params
+    raw_data_response = stub.GetRawData(metrics_pb2.RawDataRequest(limit=10))
+    # convert protobuf message to dictionary before returning
+    return MessageToDict(raw_data_response)
+
+
+@app.get("/metrics")
+def retrieve_metrics():
+    metrics_response = stub.GetMetrics(metrics_pb2.MetricsRequest())
+    # convert protobuf message to dictionary before returning
+    return MessageToDict(metrics_response)
+
+
+grpc_server = (
+    "localhost:50051" if "GRPC_SERVER" not in os.environ else os.environ["GRPC_SERVER"]
+)
+
+## main (initialize channel to connect to grpc server and create stub to
+# call grpc server methods)
+channel = grpc.insecure_channel(grpc_server)
+stub = metrics_pb2_grpc.MetricsServiceStub(channel)

db/__init__.py

Whitespace-only changes.

db/connect.py

Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
+import redis
+import os
+
+
+def get_redis_conn():
+    host = "localhost" if "REDIS_HOST" not in os.environ else os.environ["REDIS_HOST"]
+    port = "6379" if "REDIS_PORT" not in os.environ else os.environ["REDIS_PORT"]
+    r = redis.Redis(host=host, port=port, decode_responses=True)
+    return r
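The default-if-unset conditionals in get_redis_conn can equivalently be written with dict-style `.get`; a small stdlib-only illustration (a plain dict stands in for os.environ so the example is deterministic):

```python
# Stand-in for os.environ; the committed code reads the real environment.
env = {"REDIS_PORT": "6380"}

host = env.get("REDIS_HOST", "localhost")  # key absent -> default used
port = int(env.get("REDIS_PORT", "6379"))  # key present -> env value used

print(host, port)  # -> localhost 6380
```

os.environ supports the same `.get` interface, so `os.environ.get("REDIS_HOST", "localhost")` expresses each fallback in one line.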

deploy/client/Dockerfile

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+ARG PYTHON_VERSION=3.12
+FROM python:${PYTHON_VERSION}-slim
+
+# Prevents Python from writing pyc files.
+ENV PYTHONDONTWRITEBYTECODE=1
+
+# Keeps Python from buffering stdout and stderr to avoid situations where
+# the application crashes without emitting any logs due to buffering.
+ENV PYTHONUNBUFFERED=1
+
+WORKDIR /app
+
+# Copy the source code into the container.
+COPY client/ .
+COPY grpc_config/ ./grpc_config/
+COPY requirements.txt .
+
+# Download dependencies as a separate step to take advantage of Docker's caching.
+# 1. Leverage a cache mount to /root/.cache/pip to speed up subsequent builds (pip install doesn't need to be re-run each time
+# since packages are cached).
+# 2. install packages
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python -m pip install -r requirements.txt
+
+# Expose the port that the application will run on.
+EXPOSE 5001
+
+# Run the application.
+CMD ["uvicorn", "client:app", "--host", "0.0.0.0", "--port", "5001"]

deploy/client/client.yaml

Lines changed: 73 additions & 0 deletions

@@ -0,0 +1,73 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: grpc-client
+  labels:
+    app: grpc-client
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: grpc-client
+  template:
+    metadata:
+      labels:
+        app: grpc-client
+    spec:
+      containers:
+        - name: grpc-client
+          image: localhost/grpc-client:latest # replace with your built image
+          imagePullPolicy: IfNotPresent
+          ports:
+            - containerPort: 5001
+          env:
+            - name: GRPC_SERVER
+              value: "grpc-server-service:50051"
+
+---
+# internal service to target the grpc client
+apiVersion: v1
+kind: Service
+metadata:
+  name: grpc-client-service
+  labels:
+    app: grpc-client-service
+spec:
+  selector:
+    app: grpc-client
+  ports:
+    - protocol: TCP
+      port: 5001 # Port exposed by the service
+      targetPort: 5001 # Port on which the grpc client is listening
+  type: ClusterIP
+---
+
+# ingress to expose the grpc-client service
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: grpc-client-ingress
+spec:
+  # this tells kubernetes
+  # which ingress controller to use for this ingress resource
+  # (useful in particular if you have multiple ingress controllers and need to
+  # associate a specific ingress resource with a specific ingress controller)
+  ingressClassName: nginx
+  rules:
+    - host: grpc-client.local
+      http:
+        paths:
+          - path: /raw-data
+            pathType: Prefix
+            backend:
+              service:
+                name: grpc-client-service
+                port:
+                  number: 5001 # Change to your service's target port
+          - path: /metrics
+            pathType: Prefix
+            backend:
+              service:
+                name: grpc-client-service
+                port:
+                  number: 5001 # Change to your service's target port