
Commit c42b85c

Merge pull request #14 from codefortulsa/sub-titles

Create subtitles from transcripts

2 parents: 36109a1 + 716bc8a

14 files changed (+2512, -1262 lines)

.gitignore

Lines changed: 3 additions & 1 deletion
```diff
@@ -6,7 +6,10 @@ models/
 # Include specific directories
 !src/models/
 
+# Jupyter notebook
 notebooks/.ipynb_checkpoints/
+.ipynb_checkpoints/
+*/.ipynb_checkpoints/*
 
 # Python
 __pycache__/
@@ -47,4 +50,3 @@ build/
 npm-debug.log*
 yarn-debug.log*
 yarn-error.log*
-
```

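Of the three checkpoint patterns, the bare `.ipynb_checkpoints/` entry already matches checkpoint directories at any depth (a gitignore pattern with no leading slash matches at every level), so the `notebooks/`-specific and wildcard entries are redundant but harmless.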
.pre-commit-config.yaml

Lines changed: 17 additions & 0 deletions
```yaml
repos:
  - repo: https://github.com/kynan/nbstripout
    rev: 0.7.1
    hooks:
      - id: nbstripout
        name: Strip Jupyter notebook output cells
        description: Clear output from Jupyter notebooks before committing
        files: \.ipynb$

  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
        args: ['--maxkb=500']
```

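The config pins two hook repos: `kynan/nbstripout` for the notebook stripping, plus the standard `pre-commit-hooks` collection for whitespace, YAML, and large-file checks. While debugging, a single hook can be run by id, e.g. `poetry run pre-commit run nbstripout --all-files`.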
NOTEBOOK_GUIDELINES.md

Lines changed: 53 additions & 0 deletions
# Jupyter Notebook Guidelines

## Automatic Output Stripping

This repository is configured with a pre-commit hook that automatically strips output cells from Jupyter notebooks before they are committed to Git. This keeps the repository size manageable by avoiding the storage of large outputs such as images, graphs, and videos in the Git history.

### How It Works

1. The `nbstripout` pre-commit hook runs automatically before each commit.
2. It removes all output cells, execution counts, and metadata from notebooks (see the sketch below).
3. Your notebook file is stripped only in the Git repository; your local file keeps its outputs.
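
For illustration, the transformation amounts to roughly the following (a minimal sketch using the `nbformat` library, not `nbstripout`'s actual implementation; the notebook path is a placeholder):

```python
import nbformat

# Read the notebook (placeholder path), clear the fields that
# nbstripout clears, and write the result back in place.
path = "notebooks/your_notebook.ipynb"
nb = nbformat.read(path, as_version=4)
for cell in nb.cells:
    if cell.cell_type == "code":
        cell.outputs = []            # drop stdout/stderr, images, widget views
        cell.execution_count = None  # reset the In[n] counter
nbformat.write(nb, path)
```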

### Setup for New Contributors

If you're cloning this repository for the first time, you need to set up the pre-commit hooks:

```bash
# Install poetry dependencies, including the pre-commit tools
poetry install

# Install the pre-commit hooks
poetry run pre-commit install
```
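
Running `pre-commit install` writes a hook script to `.git/hooks/pre-commit`; that script is what triggers the configured hooks on each subsequent commit.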

### Testing the Setup

To verify that the pre-commit hooks are working correctly, you can run:

```bash
poetry run pre-commit run --all-files
```

### Manual Stripping

If you need to manually strip outputs from a notebook, run:

```bash
poetry run nbstripout notebooks/your_notebook.ipynb
```
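
To confirm that a notebook really was stripped, a quick check along these lines works (a hypothetical snippet; the path is a placeholder):

```python
import nbformat

# List code cells that still carry outputs or execution counts.
nb = nbformat.read("notebooks/your_notebook.ipynb", as_version=4)
dirty = [c for c in nb.cells
         if c.cell_type == "code" and (c.outputs or c.execution_count)]
print("clean" if not dirty else f"{len(dirty)} cell(s) still have outputs")
```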

## Best Practices

1. **Keep Large Data Outside Git**: Store large datasets separately (e.g., in the gitignored data/ directory).
2. **Avoid Embedding Large Files**: Don't embed videos, large images, or other binary data directly in notebooks.
3. **Document Data Sources**: Always include information on how to obtain the data your notebooks need.
4. **Separate Code and Content**: Use markdown cells to document your analysis thoroughly.

## Troubleshooting

If you encounter issues with the pre-commit hooks, ensure that:

- You have run `poetry install` to install all dependencies
- You have run `poetry run pre-commit install` to set up the hooks
- You are committing from within the Poetry environment, or use `poetry run git commit`

notebooks/meetings.ipynb

Lines changed: 247 additions & 599 deletions
Large diffs are not rendered by default.

notebooks/roll_call.ipynb

Lines changed: 7 additions & 147 deletions
```diff
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -31,17 +31,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Clip successfully extracted to: ../data/video/regular_council_meeting___2025_02_26_clip_4-50_to_5-20.mp4\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import subprocess\n",
     "from pathlib import Path\n",
@@ -97,126 +89,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "INFO:src.videos:Transcribing video with speaker diarization: ../data/video/regular_council_meeting___2025_02_26_clip_4-50_to_5-20.mp4\n",
-      "INFO:src.videos:Output will be saved to: ../data/transcripts/regular_council_meeting___2025_02_26_clip_4-50_to_5-20.diarized.json\n",
-      "INFO:src.huggingface:Auto-detected device: cpu\n",
-      "INFO:src.huggingface:Auto-selected compute_type: int8\n",
-      "INFO:src.huggingface:Loading WhisperX model: tiny on cpu with int8 precision\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "168afa65d3ae4108af591eb1993fe482",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "tokenizer.json: 0%| | 0.00/2.20M [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "89d35faecb8e447db3ccb95407e2a775",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "config.json: 0%| | 0.00/2.25k [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "f616039556ee46aaaee2f975f016aeb0",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "vocabulary.txt: 0%| | 0.00/460k [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "50bd4e88d6084638b91847587cc9ed0a",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "model.bin: 0%| | 0.00/75.5M [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../Library/Caches/pypoetry/virtualenvs/tgov_scraper-zRR99ne3-py3.11/lib/python3.11/site-packages/whisperx/assets/pytorch_model.bin`\n",
-      "INFO:src.huggingface:Loading diarization pipeline\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "No language specified, language will be first be detected for each audio file (increases inference time).\n",
-      ">>Performing voice activity detection using Pyannote...\n",
-      "Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.\n",
-      "Model was trained with torch 1.10.0+cu102, yours is 2.4.1. Bad things might happen unless you revert torch to 1.x.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "INFO:src.huggingface:WhisperX model loaded in 4.50 seconds\n",
-      "INFO:src.videos:Running initial transcription with batch size 8...\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Detected language: en (0.99) in first 30s of audio...\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "INFO:src.videos:Detected language: en\n",
-      "INFO:src.videos:Loading alignment model for detected language: en\n",
-      "INFO:src.videos:Aligning transcription with audio...\n",
-      "INFO:src.videos:Running speaker diarization...\n",
-      "/Users/owner/Library/Caches/pypoetry/virtualenvs/tgov_scraper-zRR99ne3-py3.11/lib/python3.11/site-packages/pyannote/audio/models/blocks/pooling.py:104: UserWarning: std(): degrees of freedom is <= 0. Correction should be strictly less than the reduction factor (input numel divided by output numel). (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/native/ReduceOps.cpp:1808.)\n",
-      "  std = sequences.std(dim=-1, correction=1)\n",
-      "INFO:src.videos:Assigning speakers to transcription...\n",
-      "INFO:src.videos:Processing transcription segments...\n",
-      "INFO:src.videos:Diarized transcription completed in 30.03 seconds\n",
-      "INFO:src.videos:Detailed JSON saved to: ../data/transcripts/regular_council_meeting___2025_02_26_clip_4-50_to_5-20.diarized.json\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from src.videos import transcribe_video_with_diarization\n",
     "\n",
@@ -231,24 +106,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "5d97ff70c1c3409da83c10c478f2bfaa",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "HTML(value='<h3>Meeting Script</h3><hr><p><b>[00:00:00] SPEAKER_01:</b><br>Thank you, Mr. Huffinds. Any counci…"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "def format_timestamp(seconds: float) -> str:\n",
     "    \"\"\"Convert seconds to HH:MM:SS format\"\"\"\n",
```

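The context lines end just as `format_timestamp` begins. For reference, a body consistent with that signature and docstring would look like the following (a hypothetical reconstruction, not necessarily the code in the commit):

```python
def format_timestamp(seconds: float) -> str:
    """Convert seconds to HH:MM:SS format"""
    total = int(seconds)              # drop sub-second precision
    hours, rem = divmod(total, 3600)
    minutes, secs = divmod(rem, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"
```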