Skip to content

Commit ef64ec4

Browse files
authored
feat!: 通过selenium接入了统一身份认证 (#239)
* 接入清华大学统一身份认证(selenium)
* 避免文件名出现无效字符
* 记录下载文件时的错误信息
* 优化进度条显示
* 增加-all功能,下载全部学期
1 parent 6a09df9 commit ef64ec4

File tree

8 files changed

+414
-85
lines changed

8 files changed

+414
-85
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ python = "^3.10"
1818
python-dateutil = "^2.8.2"
1919
requests = "^2.31.0"
2020
rich = "^13.7.1"
21+
selenium = "^4.15.0"
2122
tenacity = "^9.0.0"
2223
typer = "^0.16.0"
2324

thu_learn_downloader/client/client.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,5 +26,13 @@ def get_with_token(
2626
return self.get(url=url, params={**params, "_csrf": self.token})
2727

2828
@property
29-
def token(self) -> None:
30-
return self.cookies["XSRF-TOKEN"]
29+
def token(self) -> str:
    """Return the CSRF token held in the session's ``XSRF-TOKEN`` cookie.

    Returns:
        The cookie value; ``get_with_token`` sends it as the ``_csrf``
        request parameter.

    Raises:
        Exception: if the cookie is missing. The names of the cookies that
            are present are printed first to help diagnose the failure, and
            a ``KeyError`` describing the missing cookie is attached as the
            cause.
    """
    token = self.cookies.get("XSRF-TOKEN")
    if token is not None:
        return token

    # Report the problem directly instead of raising a KeyError only to
    # catch it again a few lines later.
    missing = KeyError("XSRF-TOKEN not found in cookies")
    print(f"无法获取CSRF token: {missing}")
    print(f"当前cookies: {list(self.cookies.keys())}")
    raise Exception("登录状态可能已失效,请重新登录") from missing

thu_learn_downloader/client/course.py

Lines changed: 94 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,26 @@
1010
from .homework import Homework
1111
from .model import BaseModel
1212

13+
# Module-level registry of problems met while fetching course data,
# grouped by category. Each entry is a dict describing one course.
_course_issues = {
    'missing_documents': [],
    'missing_document_classes': [],
    'missing_homeworks': [],
}


def get_course_issues():
    """Return a snapshot of the recorded course issues.

    The snapshot is independent of the registry: every category list and
    every entry dict is copied, so callers may modify the result without
    altering what has been recorded.

    Returns:
        A dict mapping each category name to a list of entry dicts.
    """
    return {
        category: [dict(entry) for entry in entries]
        for category, entries in _course_issues.items()
    }


def clear_course_issues():
    """Discard every recorded course issue, keeping the category keys."""
    # Clear in place so that every reference to the registry keeps seeing
    # the same (now empty) lists.
    for entries in _course_issues.values():
        entries.clear()
32+
1333

1434
class Course(BaseModel):
1535
client: Client = Field(exclude=True)
@@ -23,32 +43,55 @@ class Course(BaseModel):
2343

2444
@property
2545
def document_classes(self) -> Sequence[DocumentClass]:
    """Fetch the document categories of this course.

    Returns:
        One ``DocumentClass`` per row in the ``object.rows`` list of the
        server reply. If the request or the parsing raises, the failure is
        appended to the ``missing_document_classes`` list of the
        module-level issue registry and an empty list is returned.
    """
    try:
        response = self.client.get_with_token(
            url=url.make_url(path="/b/wlxt/kj/wlkc_kjflb/student/pageList"),
            params={"wlkcid": self.id},
        )
        # Decode the body once and reuse the rows.
        rows = response.json()["object"]["rows"]
        return [DocumentClass(client=self.client, **row) for row in rows]
    except Exception as e:
        # Best effort: remember which course failed and carry on.
        _course_issues['missing_document_classes'].append({
            'course': self.name,
            'course_id': self.id,
            'reason': f'异常: {str(e)}'
        })
        return []
3366

3467
@property
3568
def documents(self) -> Sequence[Document]:
    """Fetch the documents of this course, oldest upload first.

    Returns:
        ``Document`` objects built from the ``object`` list of the server
        reply, ordered by ``upload_time`` and, for equal upload times, by
        ``title``. If the request or the parsing raises, the failure is
        appended to the ``missing_documents`` list of the module-level
        issue registry and an empty list is returned.
    """
    try:
        response = self.client.get_with_token(
            url=url.make_url(
                path="/b/wlxt/kj/wlkc_kjxxb/student/kjxxbByWlkcidAndSizeForStudent"
            ),
            params={"wlkcid": self.id, "size": MAX_SIZE},
        )
        # Annotated as a list because it is sorted in place below.
        documents: list[Document] = [
            Document(client=self.client, **result)
            for result in response.json()["object"]
        ]
        # One pass with a composite key gives the same order as sorting by
        # title and then stably by upload time.
        documents.sort(
            key=lambda document: (document.upload_time, document.title)
        )
        return documents
    except Exception as e:
        # Best effort: remember which course failed and carry on.
        _course_issues['missing_documents'].append({
            'course': self.name,
            'course_id': self.id,
            'reason': f'异常: {str(e)}'
        })
        return []
4891

4992
@property
5093
def homeworks(self) -> Sequence[Homework]:
51-
return [
94+
all_homeworks = [
5295
*self._homeworks_at_url(
5396
url=url.make_url(
5497
path="/b/wlxt/kczy/zy/student/index/zyListWj",
@@ -68,11 +111,42 @@ def homeworks(self) -> Sequence[Homework]:
68111
),
69112
),
70113
]
114+
115+
# 如果所有作业API都没有返回数据,记录该课程
116+
if not all_homeworks:
117+
_course_issues['missing_homeworks'].append({
118+
'course': self.name,
119+
'course_id': self.id,
120+
'reason': '该课程无作业'
121+
})
122+
123+
return all_homeworks
71124

72125
def _homeworks_at_url(self, url: str) -> Sequence[Homework]:
    """Fetch the homework entries listed at one homework endpoint.

    Args:
        url: URL of the endpoint to query.

    Returns:
        ``Homework`` objects built from the ``object.aaData`` list of the
        reply. An empty list is returned when the status code is not 200,
        when the payload lacks that list (or it is empty/null), or when the
        request or the parsing raises.
    """
    try:
        resp: Response = self.client.get_with_token(url=url)
        if resp.status_code != 200:
            return []

        json_data: dict[str, Any] = resp.json()
        # A missing or malformed "object"/"aaData" raises here and is
        # handled by the except clause below, so no separate membership
        # checks are needed.
        results: Sequence[dict[str, Any]] = json_data["object"]["aaData"] or []
        return [
            Homework.from_json(client=self.client, json=result) for result in results
        ]
    except Exception:
        # Best effort: a failing endpoint is treated as "no homework here".
        return []

thu_learn_downloader/client/learn.py

Lines changed: 60 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -18,32 +18,67 @@ class Learn:
1818

1919
def __init__(self, language: Language = Language.ENGLISH, *args, **kwargs) -> None:
2020
self.client = Client(language, *args, **kwargs)
21+
22+
"""使用浏览器进行统一用户登录"""
23+
def login(self) -> None:
    """Log in through a browser and copy the resulting cookies to the client.

    ``login_with_browser`` supplies the cookies; each one is stored on
    ``self.client.cookies``. The student course index page is then
    requested, and a 200 status code is taken as a successful login.

    Any exception raised along the way is printed, not propagated.
    NOTE(review): callers therefore cannot detect a failed login from this
    call alone - confirm that this is intended.
    """
    from ..login.browser import login_with_browser

    try:
        cookies = login_with_browser()

        # Copy the browser cookies to the client.
        for name, value in cookies.items():
            self.client.cookies[name] = value

        # Check the session by requesting the course index page.
        response = self.client.get(url=url.make_url(path="/f/wlxt/index/course/student/"))
        if response.status_code != 200:
            raise Exception("登录验证失败,请重试")
        print("浏览器登录成功!")

    except Exception as e:
        print(f"浏览器登录失败: {e}")
2142

22-
def login(self, username: str, password: str) -> None:
23-
response: Response = self.client.get(url=url.make_url())
24-
soup: BeautifulSoup = BeautifulSoup(
25-
markup=response.text, features="html.parser"
26-
)
27-
login_form: Tag = cast(Tag, soup.select_one(selector="#loginForm"))
28-
action: str = cast(str, login_form["action"])
29-
response: Response = self.client.post(
30-
url=action, data={"i_user": username, "i_pass": password, "atOnce": True}
31-
)
32-
soup: BeautifulSoup = BeautifulSoup(
33-
markup=response.text, features="html.parser"
34-
)
35-
a: Tag = cast(Tag, soup.select_one(selector="a"))
36-
href: str = cast(str, a["href"])
37-
parse_result: ParseResult = urllib.parse.urlparse(url=href)
38-
query: dict[str, list[str]] = urllib.parse.parse_qs(qs=parse_result.query)
39-
status, ticket = query["status"][0], query["ticket"][0]
40-
self.client.get(url=href)
41-
self.client.get(
42-
url=url.make_url(path="/b/j_spring_security_thauth_roaming_entry"),
43-
params={"ticket": ticket},
44-
)
45-
self.client.get(url=url.make_url(path="/f/wlxt/index/course/student/"))
46-
assert status == "SUCCESS"
43+
# def login(self, username: str = None, password: str = None) -> None:
44+
# """
45+
# 登录方法
46+
# 如果提供了用户名和密码,使用传统登录方式(保留兼容性)
47+
# 否则使用浏览器登录方式
48+
# """
49+
# if username and password:
50+
# # 使用传统的用户名密码登录方式
51+
# self._login_with_credentials(username, password)
52+
# else:
53+
# # 使用新的浏览器登录方式
54+
# self.login_with_browser()
55+
56+
# def _login_with_credentials(self, username: str, password: str) -> None:
57+
# """传统的用户名密码登录方式(保留作为备用)"""
58+
# response: Response = self.client.get(url=url.make_url())
59+
# soup: BeautifulSoup = BeautifulSoup(
60+
# markup=response.text, features="html.parser"
61+
# )
62+
# login_form: Tag = cast(Tag, soup.select_one(selector="#loginForm"))
63+
# action: str = cast(str, login_form["action"])
64+
# response: Response = self.client.post(
65+
# url=action, data={"i_user": username, "i_pass": password, "atOnce": True}
66+
# )
67+
# soup: BeautifulSoup = BeautifulSoup(
68+
# markup=response.text, features="html.parser"
69+
# )
70+
# a: Tag = cast(Tag, soup.select_one(selector="a"))
71+
# href: str = cast(str, a["href"])
72+
# parse_result: ParseResult = urllib.parse.urlparse(url=href)
73+
# query: dict[str, list[str]] = urllib.parse.parse_qs(qs=parse_result.query)
74+
# status, ticket = query["status"][0], query["ticket"][0]
75+
# self.client.get(url=href)
76+
# self.client.get(
77+
# url=url.make_url(path="/b/j_spring_security_thauth_roaming_entry"),
78+
# params={"ticket": ticket},
79+
# )
80+
# self.client.get(url=url.make_url(path="/f/wlxt/index/course/student/"))
81+
# assert status == "SUCCESS"
4782

4883
@functools.cached_property
4984
def semesters(self) -> Sequence[Semester]:

thu_learn_downloader/download/downloader.py

Lines changed: 15 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ class Downloader:
4646
semesters_task_id: TaskID
4747
courses_task_id: TaskID
4848
documents_task_id: TaskID
49+
homeworks_task_id: TaskID
4950

5051
def __init__(
5152
self,
@@ -75,6 +76,7 @@ def __init__(
7576
self.semesters_task_id = self.progress_prepare.add_task(description="Semesters")
7677
self.courses_task_id = self.progress_prepare.add_task(description="Courses")
7778
self.documents_task_id = self.progress_prepare.add_task(description="Documents")
79+
self.homeworks_task_id = self.progress_prepare.add_task(description="Homeworks")
7880
self.live = Live(
7981
Group(
8082
Panel(self.progress_download, height=jobs + 2),
@@ -187,13 +189,10 @@ def sync_semesters(self, semesters: Sequence[Semester]) -> None:
187189
if semester.id in self.selector.semesters
188190
]
189191
self.progress_prepare.reset(task_id=self.semesters_task_id)
190-
for semester in self.progress_prepare.track(
191-
sequence=semesters,
192-
total=len(semesters),
193-
task_id=self.semesters_task_id,
194-
description="Semesters",
195-
):
192+
self.progress_prepare.update(task_id=self.semesters_task_id, total=len(semesters))
193+
for semester in semesters:
196194
self.sync_semester(semester=semester)
195+
self.progress_prepare.advance(task_id=self.semesters_task_id, advance=1)
197196

198197
def sync_semester(self, semester: Semester) -> None:
199198
self.sync_courses(semester=semester, courses=semester.courses)
@@ -204,13 +203,10 @@ def sync_courses(self, semester: Semester, courses: Sequence[Course]) -> None:
204203
course for course in courses if course.id in self.selector.courses
205204
]
206205
self.progress_prepare.reset(task_id=self.courses_task_id)
207-
for course in self.progress_prepare.track(
208-
sequence=courses,
209-
total=len(courses),
210-
task_id=self.courses_task_id,
211-
description="Courses",
212-
):
206+
self.progress_prepare.update(task_id=self.courses_task_id, total=len(courses))
207+
for course in courses:
213208
self.sync_course(semester=semester, course=course)
209+
self.progress_prepare.advance(task_id=self.courses_task_id, advance=1)
214210

215211
def sync_course(self, semester: Semester, course: Course) -> None:
216212
if self.selector.document:
@@ -236,22 +232,16 @@ def sync_documents(
236232
document_class.id: document_class for document_class in document_classes
237233
}
238234
self.progress_prepare.reset(task_id=self.documents_task_id)
239-
for index, document in enumerate(
240-
self.progress_prepare.track(
241-
sequence=documents,
242-
total=len(documents),
243-
task_id=self.documents_task_id,
244-
description="Documents",
245-
),
246-
start=1,
247-
):
235+
self.progress_prepare.update(task_id=self.documents_task_id, total=len(documents))
236+
for index, document in enumerate(documents, start=1):
248237
self.sync_document(
249238
semester=semester,
250239
course=course,
251240
document_class=document_class_map[document.class_id],
252241
document=document,
253242
index=index,
254243
)
244+
self.progress_prepare.advance(task_id=self.documents_task_id, advance=1)
255245

256246
def sync_document(
257247
self,
@@ -287,14 +277,11 @@ def sync_document(
287277
def sync_homeworks(
    self, semester: Semester, course: Course, homeworks: Sequence[Homework]
) -> None:
    """Sync every homework of a course, tracking progress as it goes.

    The progress task identified by ``homeworks_task_id`` is reset, given
    the number of homeworks as its total, and advanced by one after each
    homework has been handed to ``sync_homework``.
    """
    progress = self.progress_prepare
    task_id = self.homeworks_task_id

    progress.reset(task_id=task_id)
    progress.update(task_id=task_id, total=len(homeworks))
    for item in homeworks:
        self.sync_homework(semester=semester, course=course, homework=item)
        progress.advance(task_id=task_id, advance=1)
298285

299286
def sync_homework(
300287
self, semester: Semester, course: Course, homework: Homework

0 commit comments

Comments
 (0)