Skip to content

Commit d1c7eb8

Browse files
Merge pull request #1 from TechnoBlogger14o3/strivers_list
Strivers list
2 parents 8b0a5b9 + 4c65818 commit d1c7eb8

File tree

329 files changed

+11107
-1
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

329 files changed

+11107
-1
lines changed

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,4 +54,7 @@ __pycache__/
5454

5555
# Backup files
5656
*.bak
57-
*.backup
57+
*.backup
58+
59+
# Progress tracking
60+
.striver_progress.json

DSA Sheets.pdf

540 KB
Binary file not shown.
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
#!/usr/bin/env python3
"""
Extract problems from DSA Sheets.pdf
Hybrid approach: extract all problems, improve slug extraction
"""
import json
import re
from pathlib import Path

import pdfplumber

# Repository root: this script lives one directory below it.
ROOT = Path(__file__).resolve().parents[1]
# Input PDF and output JSON, both resolved relative to the repo root.
PDF_PATH = ROOT / "DSA Sheets.pdf"
OUTPUT_JSON = ROOT / "striver_sde_sheet_problems.json"
14+
15+
# Matches a LeetCode problem URL; group 1 captures the problem slug.
_LEETCODE_URL_RE = r'(?:https?://)?(?:www\.)?leetcode\.com/problems/([a-z0-9-]+)/?'


def _is_complete_slug(slug):
    """Heuristic: a usable slug is longer than 5 chars and was not cut mid-word."""
    return len(slug) > 5 and not slug.endswith('-')


def _read_pdf_text():
    """Concatenate the text of every PDF page, logging progress every 10 pages.

    Returns the full text with one trailing newline per extracted page.
    """
    pieces = []
    with pdfplumber.open(PDF_PATH) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            text = page.extract_text()
            if text:
                pieces.append(text + "\n")
            if page_num % 10 == 0:
                print(f"Processed {page_num} pages...")
        print(f"Total pages: {len(pdf.pages)}")
    return "".join(pieces)


def _collect_slug_map(normalized_text):
    """First pass: map each complete LeetCode slug to a nearby problem number.

    For every complete slug found in *normalized_text*, scan a window of
    100 chars before / 50 after the URL for a 1-3 digit number and record
    the first (slug -> number) association seen.
    """
    slug_to_num = {}
    for match in re.finditer(_LEETCODE_URL_RE, normalized_text, re.IGNORECASE):
        slug = match.group(1).lower()
        if not _is_complete_slug(slug):
            continue
        # Context window around the URL; the problem number usually precedes it.
        start = max(0, match.start() - 100)
        end = min(len(normalized_text), match.end() + 50)
        context = normalized_text[start:end]
        num_match = re.search(r'\b(\d{1,3})\.?\s', context)
        if num_match and slug not in slug_to_num:
            slug_to_num[slug] = int(num_match.group(1))
    return slug_to_num


def extract_problems_from_pdf():
    """Extract all problems from the PDF.

    Three passes: (1) harvest complete LeetCode slugs with nearby problem
    numbers from whitespace-normalized text; (2) walk the raw text line by
    line, tracking ALL-CAPS category headers and numbered problem lines;
    (3) append any pass-1 slugs that never matched a numbered line.

    Returns:
        list[dict]: one dict per problem with keys
        "number", "category", "title", "title_slug", "raw_line".
    """
    print(f"Reading PDF: {PDF_PATH}")
    full_text = _read_pdf_text()
    print(f"Extracted text length: {len(full_text)} characters")

    # Collapse all whitespace runs so URLs broken across lines become
    # contiguous.  (The previous `.replace(' ', ' ')` was a no-op.)
    normalized_text = re.sub(r'\s+', ' ', full_text)

    slug_to_num = _collect_slug_map(normalized_text)
    print(f"Found {len(slug_to_num)} complete LeetCode slugs with problem numbers")

    # Inverted index for O(1) number -> slug lookup in the second pass.
    # setdefault keeps the FIRST slug seen for a number, matching the
    # original linear-scan-and-break behavior.
    num_to_slug = {}
    for slug, pnum in slug_to_num.items():
        num_to_slug.setdefault(pnum, slug)

    # Second pass: extract problems line by line.
    lines = full_text.split('\n')
    current_category = None
    problems = []
    problem_num = 0  # sequential output counter, NOT the PDF's own numbering

    for i, raw in enumerate(lines):
        line = raw.strip()
        if not line:
            continue

        # Category headers are short all-uppercase lines.
        if len(line) > 3 and line.isupper() and len(line.split()) < 10:
            current_category = line
            print(f"Found category: {current_category}")
            continue

        problem_match = re.match(r'^(\d+)\.?\s*(.+)', line)
        if not problem_match:
            continue
        num = int(problem_match.group(1))
        rest = problem_match.group(2).strip()

        # Join with up to the next two non-empty lines: URLs are often
        # split across lines in the PDF.
        search_text = line
        for j in range(i + 1, min(i + 3, len(lines))):
            if lines[j].strip():
                search_text += " " + lines[j].strip()
        # Remove whitespace wedged between URL characters.
        search_normalized = re.sub(
            r'([a-z0-9-])\s+([a-z0-9-])', r'\1\2', search_text,
            flags=re.IGNORECASE)

        # Prefer a slug found directly on/near this line; fall back to the
        # pass-1 map keyed by the problem number.
        title_slug = None
        link_match = re.search(_LEETCODE_URL_RE, search_normalized, re.IGNORECASE)
        if link_match:
            candidate = link_match.group(1).lower()
            if _is_complete_slug(candidate):
                title_slug = candidate
        if title_slug is None:
            title_slug = num_to_slug.get(num)

        # Problem title: strip URLs, parenthesized and bracketed chunks;
        # reconstruct from the slug if nothing readable remains.
        title = re.sub(r'https?://[^\s]+', '', rest).strip()
        title = re.sub(r'\([^)]*\)', '', title).strip()
        title = re.sub(r'\[.*?\]', '', title).strip()
        if not title and title_slug:
            title = title_slug.replace('-', ' ').title()

        if title or title_slug:
            problem_num += 1
            problems.append({
                "number": problem_num,
                "category": current_category or "Unknown",
                "title": title,
                "title_slug": title_slug,
                "raw_line": line,
            })
            if problem_num % 50 == 0:
                print(f"Extracted {problem_num} problems...")

    # Third pass: add pass-1 slugs that never matched a numbered line.
    existing_slugs = {p["title_slug"] for p in problems if p["title_slug"]}
    for slug, pnum in slug_to_num.items():
        if slug in existing_slugs:
            continue
        # NOTE(review): this compares the PDF's own problem number (pnum)
        # against the sequential output counter "number" -- two different
        # numbering schemes.  Kept as-is to preserve the original
        # selection behavior; confirm whether a PDF-number field was meant.
        if not any(p["number"] == pnum for p in problems):
            problem_num += 1
            problems.append({
                "number": problem_num,
                "category": "Unknown",
                "title": slug.replace('-', ' ').title(),
                "title_slug": slug,
                "raw_line": f"leetcode.com/problems/{slug}",
            })

    with_slugs = [p for p in problems
                  if p.get("title_slug") and len(p["title_slug"]) > 5]
    print(f"\nProblems with valid slugs: {len(with_slugs)}")
    print(f"Total problems extracted: {len(problems)}")

    return problems
146+
147+
def save_problems(problems):
    """Write *problems* to OUTPUT_JSON as pretty-printed, UTF-8 JSON."""
    payload = json.dumps(problems, indent=2, ensure_ascii=False)
    with open(OUTPUT_JSON, 'w', encoding='utf-8') as fh:
        fh.write(payload)
    print(f"Saved {len(problems)} problems to {OUTPUT_JSON}")
152+
153+
if __name__ == "__main__":
    # Run the full pipeline: extract from the PDF, then persist to JSON.
    extracted = extract_problems_from_pdf()
    save_problems(extracted)
    print(f"\n✅ Extraction complete! Found {len(extracted)} problems")

0 commit comments

Comments
 (0)