Skip to content

Commit b91af41

Browse files
authored
Merge pull request #17 from codefortulsa/try-prefect-for-orchestration
add prefect to organize into flows and tasks
2 parents c42b85c + 2355e6a commit b91af41

18 files changed

+535
-8242
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Environment variables
22
.env
3+
.envrc
34

45
data/
56
models/
@@ -16,6 +17,7 @@ __pycache__/
1617
*.py[cod]
1718
*.so
1819
.Python
20+
.python-version
1921
build/
2022
develop-eggs/
2123
dist/

README.md

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,35 @@ poetry install --no-root
1414
poetry self add poetry-plugin-shell
1515
poetry shell
1616

17-
# Install Jupyter kernel for this environment (needed for Jupyter notebooks)
18-
python -m ipykernel install --user --name=tgov-scraper --display-name="TGOV Scraper"
17+
# Set up pre-commit hooks
18+
poetry run pre-commit install
19+
20+
# Verify pre-commit hooks are working
21+
poetry run pre-commit run --all-files
22+
23+
# See notebook_precommit.md for more details on how notebook outputs are automatically stripped
1924
```
2025

2126
## Running
27+
### Jupyter notebooks
2228

2329
```bash
30+
# Install Jupyter kernel for this environment (needed for Jupyter notebooks)
31+
python -m ipykernel install --user --name=tgov-scraper --display-name="TGOV Scraper"
32+
2433
jupyter notebook
2534
```
2635

27-
## Running Tests
36+
### Prefect flows
37+
See https://docs.prefect.io/get-started
38+
39+
```bash
40+
prefect server start # to start the persistent server
41+
42+
python -m flows.translate_meetings # to run a specific flow
43+
```
44+
45+
### Tests
2846

2947
```bash
3048
# Run all tests
@@ -39,12 +57,16 @@ pytest -v
3957

4058
## Project Structure
4159

60+
- `data/`: local data artifacts
61+
- `flows/`: Prefect flows
62+
- `notebooks/`: Jupyter notebooks for analysis and exploration
63+
- `scripts/`: one-off scripts for downloading, conversions, etc.
4264
- `src/`: Source code for the scraper
4365
- `models/`: Pydantic models for data representation
44-
- 'scripts`: one off scripts for downloading, conversions, etc
66+
- `tasks/`: Prefect tasks
4567
- `tests/`: Test files
4668
- `notebooks/`: Jupyter notebooks for analysis and exploration
47-
- `data/`: output from notebooks
69+
- `data/`: output from notebooks
4870

4971

5072
## Running the transcription scripts

data/meetings.csv

Lines changed: 0 additions & 7153 deletions
This file was deleted.

flows/__init__.py

Whitespace-only changes.

flows/translate_meetings.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from prefect import flow
2+
3+
from tasks.meetings import create_meetings_csv
4+
5+
6+
@flow(log_prints=True)
7+
async def translate_meetings():
8+
await create_meetings_csv()
9+
# TODO: await download_videos()
10+
# TODO: await transcribe_videos()
11+
# TODO: await diarize_transcriptions()
12+
# TODO: await translate_transcriptions()
13+
# TODO: await create_subtitled_video_pages()
14+
15+
if __name__ == "__main__":
16+
import asyncio
17+
asyncio.run(translate_meetings())
File renamed without changes.

notebooks/meetings.ipynb

Lines changed: 367 additions & 42 deletions
Large diffs are not rendered by default.

notebooks/vtt_subtitles.ipynb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,17 +70,17 @@
7070
"\n",
7171
"# Import from the new subtitles module\n",
7272
"from src.subtitles import create_track, load_transcript\n",
73-
"from src.models.subtitles import TrackFormat\n",
73+
"from src.models.subtitles import SubtitleTrack\n",
7474
"\n",
7575
"# Path to the transcript file\n",
7676
"transcript_file = Path(\n",
7777
" \"../data/transcripts/regular_council_meeting___2025_02_26.diarized.json\"\n",
7878
")\n",
7979
"\n",
8080
"# Create VTT track\n",
81-
"vtt_track = create_track(\n",
81+
"vtt_track: SubtitleTrack = create_track(\n",
8282
" transcript_data=transcript_file,\n",
83-
" track_format='vtt',\n",
83+
" format='vtt',\n",
8484
" max_duration=5.0,\n",
8585
" include_speaker_prefix=False,\n",
8686
")\n",

pyproject.toml

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22
name = "tgov scraper"
33
version = "0.1.0"
44
description = "A set of scripts and notebooks for exploring Tulsa Government Access Television"
5-
authors = ["jdungan <johnadungan@gmail.com>"]
5+
authors = ["jdungan <johnadungan@gmail.com>", "groovecoder <luke@groovecoder.com>"]
66
readme = "README.md"
77

88
[tool.poetry.dependencies]
9-
python = "3.11.*"
9+
python = ">=3.11,<3.13"
1010
selectolax = "^0.3.28"
1111
aiohttp = "^3.11.13"
1212
pytest-asyncio = "^0.25.3"
@@ -25,14 +25,17 @@ jupyter-nbextensions-configurator = "^0.6.4"
2525
python-dotenv = "^1.0.1"
2626
aiofiles = "^24.1.0"
2727
faster-whisper = "^1.1.1"
28+
prefect = "^3.3.0"
29+
boto3 = "^1.37.24"
2830

2931

3032
[tool.poetry.group.dev.dependencies]
31-
jupyter = "^1.1.1"
33+
ipdb = "^0.13.13"
3234
ipykernel = "^6.29.5"
33-
pytest = "^8.0.0"
34-
pre-commit = "^4.2.0"
35+
jupyter = "^1.1.1"
3536
nbstripout = "^0.8.1"
37+
pre-commit = "^4.2.0"
38+
pytest = "^8.0.0"
3639

3740
[build-system]
3841
requires = ["poetry-core"]

scripts/download_m3u8.py

Lines changed: 0 additions & 176 deletions
This file was deleted.

0 commit comments

Comments
 (0)