Skip to content

Commit 1210dce

Browse files
authored
Assign urls to edx contentfiles when possible (#2420)
1 parent 77e1d30 commit 1210dce

File tree

3 files changed

+364
-3
lines changed

3 files changed

+364
-3
lines changed

learning_resources/etl/utils.py

Lines changed: 125 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@
1818
from pathlib import Path
1919
from subprocess import check_call
2020
from tempfile import TemporaryDirectory
21+
from typing import Optional
2122

2223
import boto3
2324
import rapidjson
2425
import requests
26+
from defusedxml import ElementTree
2527
from django.conf import settings
2628
from django.utils.dateparse import parse_duration
2729
from django.utils.text import slugify
@@ -341,7 +343,7 @@ def documents_from_olx(
341343
"mime_type": mimetype,
342344
"archive_checksum": archive_checksum,
343345
"file_extension": extension_lower,
344-
"source_path": f"{path}/{filename}",
346+
"source_path": f"{path}/{filename.replace(' ', '_')}",
345347
},
346348
)
347349

@@ -407,7 +409,126 @@ def text_from_sjson_content(content: str):
407409
return " ".join(data.get("text", []))
408410

409411

412+
def get_root_url_for_source(etl_source: str) -> str | None:
    """
    Get the content base URL configured for an ETL source

    Args:
        etl_source (str): The ETL source value to look up

    Returns:
        str | None: The base URL setting mapped to the source, or None when
            the source is not one of the sources listed below
    """
    base_urls = {
        ETLSource.mitxonline.value: settings.CONTENT_BASE_URL_MITXONLINE,
        ETLSource.xpro.value: settings.CONTENT_BASE_URL_XPRO,
        ETLSource.oll.value: settings.CONTENT_BASE_URL_OLL,
        ETLSource.mit_edx.value: settings.CONTENT_BASE_URL_EDX,
    }
    # .get() so that an unlisted source yields None instead of raising KeyError
    return base_urls.get(etl_source)
429+
430+
431+
def is_valid_uuid(uuid_string: str) -> bool:
    """
    Check if a string is a valid UUID

    Args:
        uuid_string (str): The candidate string, in any form accepted by
            uuid.UUID (with or without hyphens, braces, or a urn:uuid: prefix)

    Returns:
        bool: True if uuid.UUID can parse the value, False otherwise
            (including when the value is None or not a string)
    """
    try:
        uuid.UUID(uuid_string)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed string; TypeError: None was passed;
        # AttributeError: a non-string value without str methods was passed
        return False
    return True
440+
441+
442+
def get_url_from_module_id(
    module_id: str,
    run: LearningResourceRun,
    video_srt_metadata: Optional[dict] = None,
) -> str | None:
    """
    Get the URL for a module based on its ID

    Args:
        module_id (str): The module ID
        run (LearningResourceRun): The run associated with the module
        video_srt_metadata (dict): Optional mapping of transcript module ID
            to the module ID of the video it belongs to

    Returns:
        str | None: The URL for the module, or None when the module ID is
            empty, the run's ETL source has no base URL, or the module ID
            is neither an asset nor a block with a UUID identifier
    """
    if not module_id:
        return None
    root_url = get_root_url_for_source(run.learning_resource.etl_source)
    if root_url is None:
        # Without a base URL every f-string below would start with "None/"
        return None
    # OLL needs to have 'course-v1:' added to the run_id
    run_id = (
        f"course-v1:{run.run_id}"
        if run.learning_resource.etl_source == ETLSource.oll.value
        else run.run_id
    )
    if module_id.startswith("asset"):
        video_meta = video_srt_metadata.get(module_id, {}) if video_srt_metadata else {}
        if video_meta:
            # Link to the parent video
            return f"{root_url}/courses/{run_id}/jump_to/{video_meta.split('@')[-1]}"
        return f"{root_url}/{module_id}"
    # Only blocks whose trailing identifier parses as a UUID get a URL
    block_id = module_id.split("@")[-1]
    if module_id.startswith("block") and is_valid_uuid(block_id):
        return f"{root_url}/courses/{run_id}/jump_to_id/{block_id}"
    return None
476+
477+
478+
def parse_video_transcripts_xml(
    run: LearningResourceRun, xml_content: str, path: Path
) -> dict:
    """
    Map every transcript referenced by a video XML document to that video

    Args:
        run (LearningResourceRun): The run the video belongs to
        xml_content (str): The XML text of the video definition
        path (Path): Location of the video XML; it is stringified and passed
            to get_edx_module_id to obtain the video's module id

    Returns:
        dict: transcript edx_module_id -> video edx_module_id. Empty when the
            root element has no url_name; on an XML parse error the error is
            logged and whatever was collected so far is returned
    """
    mapping = {}
    try:
        video_element = ElementTree.fromstring(xml_content)

        # The root video element must carry a url_name to be processed
        if not video_element.get("url_name"):
            log.warning("No url_name found in video XML")
            return {}

        # Each <transcript src="..."> anywhere below the root adds one entry
        for transcript_element in video_element.findall(".//transcript"):
            src = transcript_element.get("src")
            if not src:
                continue
            video_module_id = get_edx_module_id(str(path), run)
            mapping[get_edx_module_id(f"static/{src}", run)] = video_module_id
    except ElementTree.ParseError:
        log.exception("Error parsing video XML for %s: %s", run, path)
    return mapping
505+
506+
507+
def get_video_metadata(olx_path: str, run: LearningResourceRun) -> dict:
    """
    Get metadata for video SRT/VTT files in an OLX path

    Walks the "video" directory under the OLX path, parses every XML file
    found there, and merges the resulting transcript mappings.

    Args:
        olx_path (str): Root directory of the OLX content
        run (LearningResourceRun): The run the OLX content belongs to

    Returns:
        dict: Combined output of parse_video_transcripts_xml for each XML
            file; empty if the OLX path has no "video" directory
    """
    video_transcript_mapping = {}
    video_path = Path(olx_path, "video")
    if not video_path.exists():
        log.debug("No video directory found in OLX path: %s", olx_path)
        return video_transcript_mapping
    for root, _, files in os.walk(str(video_path)):
        for filename in files:
            if Path(filename).suffix.lower() != ".xml":
                continue
            xml_file = Path(root, filename)
            video_xml = xml_file.read_bytes().decode("utf-8")

            # Pass the file's path, not an open/closed file handle: the
            # parser stringifies this argument to derive the video module id
            video_transcript_mapping.update(
                parse_video_transcripts_xml(run, video_xml, xml_file)
            )

    return video_transcript_mapping
528+
529+
410530
def _process_olx_path(olx_path: str, run: LearningResourceRun, *, overwrite):
531+
video_srt_metadata = get_video_metadata(olx_path, run)
411532
for document, metadata in documents_from_olx(olx_path):
412533
source_path = metadata.get("source_path")
413534
edx_module_id = get_edx_module_id(source_path, run)
@@ -465,6 +586,7 @@ def _process_olx_path(olx_path: str, run: LearningResourceRun, *, overwrite):
465586
"file_extension": file_extension,
466587
"source_path": source_path,
467588
"edx_module_id": edx_module_id,
589+
"url": get_url_from_module_id(edx_module_id, run, video_srt_metadata),
468590
**content_dict,
469591
}
470592
)
@@ -741,7 +863,7 @@ def parse_certification(offeror, runs_data):
741863
)
742864

743865

744-
def iso8601_duration(duration_str: str) -> str or None:
866+
def iso8601_duration(duration_str: str) -> str | None:
745867
"""
746868
Parse the duration from a string and return it in ISO-8601 format
747869
@@ -821,7 +943,7 @@ def calculate_weeks(num: int, from_unit: str) -> int:
821943
return num
822944

823945

824-
def transform_interval(interval_txt: str) -> str or None:
946+
def transform_interval(interval_txt: str) -> str | None:
825947
"""
826948
Transform any interval units to standard English units
827949
Only languages currently supported are English and Spanish

0 commit comments

Comments
 (0)