-
Notifications
You must be signed in to change notification settings - Fork 1
Strivers list #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
e9d92f5
34a89ee
691484b
72e097e
081438d
e2f980e
bfb2329
363de37
b772f87
51d6eaf
a6a8e92
9c2b349
4852ec7
c14a578
5f92781
c249a8c
ca844ec
544d07a
e3ef156
4c8083f
d080479
5ee330d
e18e9ca
69c4361
a9f6f18
8a60340
636e261
d8a9d9c
c9e4b32
8468da1
4f3b410
beab043
496370b
a4c579b
eb74837
cc4a1ce
967af30
0f766ed
c8c16f1
a876434
5364dbc
86b2b57
511ee34
530ea77
f14f44b
be0b3d1
cd76756
54a0736
dadb875
beceeeb
01dc146
a3f0720
ddddf3b
930cdd2
cb949bf
054e780
025ba22
599ddcf
aab402d
a627a18
4c83740
8ac884e
51d614b
fe94c60
d0af626
7b277c7
c26eff5
4ced8c0
75274b1
1095aac
973a88b
8ad432c
fcefe84
e31ffaf
23818be
707f120
6d88b96
4034fbb
555cf8f
17f1345
149fb11
05db9c2
f779c98
5c523d1
6107a84
4899227
605c2e2
7ec8356
d312fa1
1909d03
50a3343
8f19f6b
c8d8c3f
e4c3515
ead1662
ad94d07
622714f
f758134
7ca15e4
125001a
b2d7743
15743ff
dda61dc
0fde53a
6ef4e35
7a7ecfb
8236198
392f9eb
8d4ba3e
ac1989c
83483e6
dc63992
b3fa913
e052f24
e5f2939
53f19d9
dfe6685
d012bb9
df0efec
f778c07
2acc163
7cde71f
7902926
c85de52
e0b6fda
c1aa697
e13e1e9
933b497
7ad899c
f490ed2
0c8a734
ea9ceb6
fc179d9
407d856
f04fd46
433ed4c
0068d17
ca15c46
3159b13
62b252e
8f54d93
6f66c22
ef6b4c8
3ab7efb
9d068dd
4057fc2
d8c5234
4d69177
f14946e
54b307b
72e0fdb
9b6b355
e9cdafe
f4624ae
6b12f7c
b229997
8af191c
a75211e
e3eed2e
4c65818
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -54,4 +54,7 @@ __pycache__/ | |
|
|
||
| # Backup files | ||
| *.bak | ||
| *.backup | ||
| *.backup | ||
|
|
||
| # Progress tracking | ||
| .striver_progress.json | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,156 @@ | ||
#!/usr/bin/env python3
"""
Extract problems from DSA Sheets.pdf
Hybrid approach: extract all problems, improve slug extraction
"""
import json
import re
from pathlib import Path

import pdfplumber

# Repository root: one directory above the folder holding this script.
ROOT = Path(__file__).resolve().parents[1]
# Input PDF and output JSON both live at the repository root.
PDF_PATH = ROOT / "DSA Sheets.pdf"
OUTPUT_JSON = ROOT / "striver_sde_sheet_problems.json"
|
|
||
# Matches LeetCode problem URLs; group(1) is the slug. Compiled once because
# it is used both for the whole-document scan and once per numbered line.
_LEETCODE_RE = re.compile(
    r'(?:https?://)?(?:www\.)?leetcode\.com/problems/([a-z0-9-]+)/?',
    re.IGNORECASE,
)


def _is_complete_slug(slug):
    """Heuristic: treat a slug as complete if it is longer than 5 chars and
    not truncated mid-word (PDF line breaks often cut URLs on a hyphen)."""
    return len(slug) > 5 and not slug.endswith('-')


def _read_pdf_text():
    """Read PDF_PATH and return the concatenated text of all pages."""
    print(f"Reading PDF: {PDF_PATH}")
    with pdfplumber.open(PDF_PATH) as pdf:
        parts = []
        for page_num, page in enumerate(pdf.pages, 1):
            text = page.extract_text()
            if text:
                parts.append(text + "\n")
            if page_num % 10 == 0:
                print(f"Processed {page_num} pages...")
        # BUGFIX: report the page count while the context manager is still
        # open -- ``pdf`` is closed (and ``pdf.pages`` invalid) once the
        # ``with`` block exits.
        print(f"Total pages: {len(pdf.pages)}")
    full_text = "".join(parts)
    print(f"Extracted text length: {len(full_text)} characters")
    return full_text


def _collect_slug_numbers(normalized_text):
    """First pass: map each complete LeetCode slug found anywhere in the
    document to the nearest problem number in its surrounding context.

    The numbers here are the ones printed in the PDF, not extraction order.
    """
    slug_numbers = {}
    for match in _LEETCODE_RE.finditer(normalized_text):
        slug = match.group(1).lower()
        if not _is_complete_slug(slug):
            continue
        # Look a little before and after the URL for a "NN." problem label.
        start = max(0, match.start() - 100)
        end = min(len(normalized_text), match.end() + 50)
        context = normalized_text[start:end]
        num_match = re.search(r'\b(\d{1,3})\.?\s', context)
        if num_match and slug not in slug_numbers:
            slug_numbers[slug] = int(num_match.group(1))
    return slug_numbers


def _parse_numbered_lines(full_text, slug_numbers):
    """Second pass: walk the text line by line, pairing numbered problem
    lines with titles and slugs. Returns the list of problem dicts, where
    ``number`` is a sequential extraction counter."""
    problems = []
    lines = full_text.split('\n')
    current_category = None

    for i, line in enumerate(lines):
        line = line.strip()
        if not line:
            continue

        # Short ALL-CAPS lines are treated as category headers.
        if len(line) > 3 and line.isupper() and len(line.split()) < 10:
            current_category = line
            print(f"Found category: {current_category}")
            continue

        problem_match = re.match(r'^(\d+)\.?\s*(.+)', line)
        if not problem_match:
            continue
        pdf_number = int(problem_match.group(1))
        rest = problem_match.group(2).strip()

        # URLs are frequently split across lines in the PDF, so search the
        # current line joined with the next two non-empty lines.
        search_text = line
        for j in range(i + 1, min(i + 3, len(lines))):
            if lines[j].strip():
                search_text += " " + lines[j].strip()
        # Collapse whitespace that breaks URLs mid-slug.
        search_normalized = re.sub(
            r'([a-z0-9-])\s+([a-z0-9-])', r'\1\2', search_text,
            flags=re.IGNORECASE,
        )

        title_slug = None
        link_match = _LEETCODE_RE.search(search_normalized)
        if link_match:
            candidate = link_match.group(1).lower()
            if _is_complete_slug(candidate):
                title_slug = candidate

        # Fall back to the document-wide slug map, keyed by the problem
        # number printed in the PDF.
        if title_slug is None:
            for slug, pnum in slug_numbers.items():
                if pnum == pdf_number:
                    title_slug = slug
                    break

        # Derive a human-readable title: strip URLs, parentheticals and
        # bracketed annotations; fall back to a title-cased slug.
        title = re.sub(r'https?://[^\s]+', '', rest).strip()
        title = re.sub(r'\([^)]*\)', '', title).strip()
        title = re.sub(r'\[.*?\]', '', title).strip()
        if not title and title_slug:
            title = title_slug.replace('-', ' ').title()

        if title or title_slug:
            problems.append({
                "number": len(problems) + 1,
                "category": current_category or "Unknown",
                "title": title,
                "title_slug": title_slug,
                "raw_line": line,
            })
            if len(problems) % 50 == 0:
                print(f"Extracted {len(problems)} problems...")
    return problems


def extract_problems_from_pdf():
    """Extract all problems from the PDF.

    Returns a list of dicts with keys ``number``, ``category``, ``title``,
    ``title_slug`` and ``raw_line``. ``number`` is a sequential counter over
    extracted problems, NOT the problem number printed in the PDF.
    """
    full_text = _read_pdf_text()

    # Normalize text so URLs broken across line breaks can be matched.
    normalized_text = full_text.replace('\n', ' ').replace('  ', ' ')

    all_slug_matches = _collect_slug_numbers(normalized_text)
    print(f"Found {len(all_slug_matches)} complete LeetCode slugs with problem numbers")

    problems = _parse_numbered_lines(full_text, all_slug_matches)

    # Add any slugs found document-wide that the numbered pass missed.
    # BUGFIX: membership is decided purely by slug. The old code also
    # compared the PDF-derived problem number against the sequential
    # extraction counter -- two unrelated numbering systems -- which
    # wrongly suppressed legitimate entries.
    existing_slugs = {p["title_slug"] for p in problems if p.get("title_slug")}
    for slug in all_slug_matches:
        if slug not in existing_slugs:
            problems.append({
                "number": len(problems) + 1,
                "category": "Unknown",
                "title": slug.replace('-', ' ').title(),
                "title_slug": slug,
                "raw_line": f"leetcode.com/problems/{slug}",
            })

    with_slugs = [p for p in problems
                  if p.get("title_slug") and len(p["title_slug"]) > 5]
    print(f"\nProblems with valid slugs: {len(with_slugs)}")
    print(f"Total problems extracted: {len(problems)}")

    return problems
|
|
||
def save_problems(problems):
    """Serialize *problems* to OUTPUT_JSON as pretty-printed UTF-8 JSON."""
    payload = json.dumps(problems, indent=2, ensure_ascii=False)
    OUTPUT_JSON.write_text(payload, encoding='utf-8')
    print(f"Saved {len(problems)} problems to {OUTPUT_JSON}")
|
|
||
if __name__ == "__main__":
    # Run the full pipeline: parse the PDF, then persist the results.
    extracted = extract_problems_from_pdf()
    save_problems(extracted)
    print(f"\n✅ Extraction complete! Found {len(extracted)} problems")
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Bug: Variable accessed after context manager closes
The
`pdf` variable is accessed at line 29 after the `with pdfplumber.open(PDF_PATH) as pdf:` block closes at line 27. At this point, the `pdf` object is no longer valid because the context manager has already closed it. Accessing `pdf.pages` outside the `with` block will fail or produce undefined behavior.