Skip to content

Commit 028474e

Browse files
committed
Update LinkedIn profile activity
1 parent 5f755a8 commit 028474e

File tree

1 file changed

+157
-35
lines changed

1 file changed

+157
-35
lines changed

llmstack/processors/providers/linkedin/profile_activity.py

Lines changed: 157 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,10 @@
44
from asgiref.sync import async_to_sync
55
from langrocks.client import WebBrowser
66
from langrocks.common.models.web_browser import WebBrowserCommand, WebBrowserCommandType
7-
from pydantic import Field
7+
from pydantic import BaseModel, Field
88

99
from llmstack.apps.schemas import OutputTemplate
10+
from llmstack.common.utils.text_extraction_service import PromptlyTextExtractionService
1011
from llmstack.processors.providers.api_processor_interface import (
1112
ApiProcessorInterface,
1213
ApiProcessorSchema,
@@ -30,6 +31,36 @@ class ProfileActivityInput(ApiProcessorSchema):
3031
)
3132

3233

34+
# Text-extraction client created once at import time and reused by the
# scraping helpers in this module, which hand it HTML fragments as bytes
# (mime type "text/html") and read the ``.text`` of the result.
text_extraction_service = PromptlyTextExtractionService()
35+
36+
37+
class LinkedInProfile(BaseModel):
    # Container for the pieces of a LinkedIn profile page. Every field is
    # optional and defaults to None, so a profile with missing sections can
    # still be represented.

    # Free-text sections of the profile, one field per section.
    about: Optional[str] = Field(default=None, description="About section")
    experience: Optional[str] = Field(default=None, description="Experience section")
    education: Optional[str] = Field(default=None, description="Education section")
    skills: Optional[str] = Field(default=None, description="Skills section")
    interests: Optional[str] = Field(default=None, description="Interests section")

    # Screenshot of the profile page, when one was captured.
    screenshot: Optional[str] = Field(default=None, description="Screenshot of the profile page")
62+
63+
3364
class ProfileActivityOutput(ApiProcessorSchema):
3465
posts: List[str] = Field(
3566
description="Posts and reposts from the profile",
@@ -43,6 +74,10 @@ class ProfileActivityOutput(ApiProcessorSchema):
4374
description="Reactions to the content",
4475
default=[],
4576
)
77+
profile: Optional[LinkedInProfile] = Field(
78+
description="Profile details",
79+
default=None,
80+
)
4681
profile_url: str = Field(
4782
description="The profile URL that was used",
4883
default="",
@@ -58,24 +93,56 @@ class ProfileActivityConfiguration(ApiProcessorSchema):
5893
description="LinkedIn login session connection to use",
5994
json_schema_extra={"advanced_parameter": False, "widget": "connection"},
6095
)
61-
n_posts: int = Field(
62-
description="Number of posts to get",
63-
default=5,
64-
ge=1,
65-
le=100,
96+
get_profile_screenshot: bool = Field(
97+
description="Whether to get a screenshot of the profile",
98+
default=False,
6699
)
67-
n_comments: int = Field(
68-
description="Number of comments to get",
69-
default=5,
70-
ge=1,
71-
le=100,
100+
get_posts: bool = Field(
101+
description="Whether to get recent posts",
102+
default=True,
72103
)
73-
n_reactions: int = Field(
74-
description="Number of reactions to get",
75-
default=5,
76-
ge=1,
77-
le=100,
104+
get_comments: bool = Field(
105+
description="Whether to get recent comments",
106+
default=True,
107+
)
108+
get_reactions: bool = Field(
109+
description="Whether to get recent reactions",
110+
default=True,
111+
)
112+
113+
114+
def get_user_profile_details(profile_url, web_browser):
    """Load a LinkedIn profile page and extract its main sections as text.

    Args:
        profile_url: URL of the profile page. Trailing slashes are stripped
            before the browser navigates to it.
        web_browser: Browser client exposing ``run_commands``. The response
            it returns is read for its ``screenshot`` and ``html`` attributes.

    Returns:
        A dict that always contains a ``"screenshot"`` key (whatever the
        browser response reported) and, for each recognised section found on
        the page (``about``, ``education``, ``experience``, ``skills``,
        ``interests``), a key of that name mapped to the extracted text.
    """
    # Sections are recognised by the id of their profile-card anchor element.
    # A tuple keeps the membership test hash-free, matching the original list.
    wanted_sections = ("about", "education", "experience", "skills", "interests")

    browser_response = web_browser.run_commands(
        [
            WebBrowserCommand(
                command_type=WebBrowserCommandType.GOTO,
                data=profile_url.rstrip("/"),
            ),
            # Pause before the page is read; presumably milliseconds, giving
            # the page time to render -- TODO confirm the unit.
            WebBrowserCommand(
                command_type=WebBrowserCommandType.WAIT,
                data="5000",
            ),
        ]
    )

    profile_data = {"screenshot": browser_response.screenshot}

    sections = _query_selector_all(browser_response.html, "div#profile-content main section")
    for section in sections:
        card_element = section.select_one(".pv-profile-card__anchor")
        if not card_element:
            continue

        section_id = card_element.attrs.get("id")
        if section_id not in wanted_sections:
            continue

        # Remove all aria-hidden="true" elements in the section so they are
        # not part of the markup handed to the text extractor.
        for hidden_element in section.select("[aria-hidden=true]"):
            hidden_element.decompose()

        extraction_result = text_extraction_service.extract_from_bytes(
            section.encode(), mime_type="text/html", filename="file.html"
        )
        profile_data[section_id] = extraction_result.text

    return profile_data
79146

80147

81148
def get_user_recent_posts(profile_url, web_browser):
@@ -96,7 +163,10 @@ def get_user_recent_posts(profile_url, web_browser):
96163
)
97164
page_html = browser_response.html
98165
selectors = _query_selector_all(page_html, "div.feed-shared-update-v2")
99-
text = [selector.text.strip().rstrip() for selector in selectors]
166+
text = [
167+
text_extraction_service.extract_from_bytes(selector.encode(), mime_type="text/html", filename="file.html").text
168+
for selector in selectors
169+
]
100170

101171
return text
102172

@@ -119,7 +189,10 @@ def get_user_recent_comments(profile_url, web_browser):
119189
)
120190
page_html = browser_response.html
121191
selectors = _query_selector_all(page_html, "div.feed-shared-update-v2")
122-
text = [selector.text.strip().rstrip() for selector in selectors]
192+
text = [
193+
text_extraction_service.extract_from_bytes(selector.encode(), mime_type="text/html", filename="file.html").text
194+
for selector in selectors
195+
]
123196

124197
return text
125198

@@ -141,7 +214,10 @@ def get_user_recent_reactions(profile_url, web_browser):
141214
)
142215
page_html = browser_response.html
143216
selectors = _query_selector_all(page_html, "div.feed-shared-update-v2")
144-
text = [selector.text.strip().rstrip() for selector in selectors]
217+
text = [
218+
text_extraction_service.extract_from_bytes(selector.encode(), mime_type="text/html", filename="file.html").text
219+
for selector in selectors
220+
]
145221

146222
return text
147223

@@ -204,27 +280,43 @@ def provider_slug() -> str:
204280
@classmethod
205281
def get_output_template(cls) -> Optional[OutputTemplate]:
206282
return OutputTemplate(
207-
markdown="""## Posts
283+
markdown="""Profile URL: {{profile_url}}
284+
{% if profile %}
285+
{{profile.about}}
286+
287+
{{profile.experience}}
288+
289+
{{profile.education}}
290+
291+
{{profile.skills}}
292+
293+
{{profile.interests}}
294+
{% endif %}
295+
{% if posts.size > 0 %}
296+
## Posts
208297
209298
{% for post in posts %}
210299
{{post}}
211300
212301
{% endfor %}
302+
{% endif %}
213303
304+
{% if comments.size > 0 %}
214305
## Comments
215306
216307
{% for comment in comments %}
217308
{{comment}}
218309
219310
{% endfor %}
220-
311+
{% endif %}
312+
{% if reactions.size > 0 %}
221313
## Reactions
222314
223315
{% for reaction in reactions %}
224316
{{reaction}}
225317
226318
{% endfor %}
227-
319+
{% endif %}
228320
{% if error %}
229321
{{error}}
230322
{% endif %}""",
@@ -243,7 +335,7 @@ def process(self) -> dict:
243335
with WebBrowser(
244336
f"{settings.RUNNER_HOST}:{settings.RUNNER_PORT}",
245337
interactive=False,
246-
capture_screenshot=False,
338+
capture_screenshot=self._config.get_profile_screenshot,
247339
html=True,
248340
session_data=(
249341
self._env["connections"][self._config.connection_id]["configuration"]["_storage_state"]
@@ -261,17 +353,47 @@ def process(self) -> dict:
261353
)
262354

263355
if user_profile:
264-
user_recent_posts = get_user_recent_posts(self._input.profile_url, web_browser)
265-
user_recent_comments = get_user_recent_comments(self._input.profile_url, web_browser)
266-
user_recent_reactions = get_user_recent_reactions(self._input.profile_url, web_browser)
267-
268-
async_to_sync(output_stream.write)(
269-
ProfileActivityOutput(
270-
posts=user_recent_posts[: self._config.n_posts],
271-
comments=user_recent_comments[: self._config.n_comments],
272-
reactions=user_recent_reactions[: self._config.n_reactions],
273-
profile_url=self._input.profile_url,
274-
)
275-
)
356+
async_to_sync(output_stream.write)(ProfileActivityOutput(profile_url=user_profile))
357+
profile_details = get_user_profile_details(user_profile, web_browser)
358+
if profile_details:
359+
async_to_sync(output_stream.write)(
360+
ProfileActivityOutput(profile=LinkedInProfile(**profile_details))
361+
)
362+
if self._config.get_posts:
363+
user_recent_posts = get_user_recent_posts(self._input.profile_url, web_browser)
364+
if user_recent_posts:
365+
async_to_sync(output_stream.write)(ProfileActivityOutput(posts=user_recent_posts))
366+
else:
367+
async_to_sync(output_stream.write)(
368+
ProfileActivityOutput(
369+
error=f"Could not find any posts for the profile {self._input.profile_url}",
370+
)
371+
)
372+
if self._config.get_comments:
373+
user_recent_comments = get_user_recent_comments(self._input.profile_url, web_browser)
374+
if user_recent_comments:
375+
async_to_sync(output_stream.write)(ProfileActivityOutput(comments=user_recent_comments))
376+
else:
377+
async_to_sync(output_stream.write)(
378+
ProfileActivityOutput(
379+
error=f"Could not find any comments for the profile {self._input.profile_url}",
380+
)
381+
)
382+
if self._config.get_reactions:
383+
user_recent_reactions = get_user_recent_reactions(self._input.profile_url, web_browser)
384+
if user_recent_reactions:
385+
async_to_sync(output_stream.write)(ProfileActivityOutput(reactions=user_recent_reactions))
386+
else:
387+
async_to_sync(output_stream.write)(
388+
ProfileActivityOutput(
389+
error=f"Could not find any reactions for the profile {self._input.profile_url}",
390+
)
391+
)
276392

393+
else:
394+
async_to_sync(output_stream.write)(
395+
ProfileActivityOutput(
396+
error="Could not find the profile. Please provide a valid profile URL or search term or check your connection.",
397+
)
398+
)
277399
return output_stream.finalize()

0 commit comments

Comments
 (0)