44from asgiref .sync import async_to_sync
55from langrocks .client import WebBrowser
66from langrocks .common .models .web_browser import WebBrowserCommand , WebBrowserCommandType
7- from pydantic import Field
7+ from pydantic import BaseModel , Field
88
99from llmstack .apps .schemas import OutputTemplate
10+ from llmstack .common .utils .text_extraction_service import PromptlyTextExtractionService
1011from llmstack .processors .providers .api_processor_interface import (
1112 ApiProcessorInterface ,
1213 ApiProcessorSchema ,
@@ -30,6 +31,36 @@ class ProfileActivityInput(ApiProcessorSchema):
3031 )
3132
3233
34+ text_extraction_service = PromptlyTextExtractionService ()
35+
36+
# Text of the individual sections of a LinkedIn profile page, plus an optional
# screenshot of that page. Every field is optional and defaults to None.
# NOTE: documented with comments rather than a class docstring so the model's
# generated JSON schema is left untouched.
class LinkedInProfile(BaseModel):
    about: Optional[str] = Field(default=None, description="About section")
    experience: Optional[str] = Field(default=None, description="Experience section")
    education: Optional[str] = Field(default=None, description="Education section")
    skills: Optional[str] = Field(default=None, description="Skills section")
    interests: Optional[str] = Field(default=None, description="Interests section")
    screenshot: Optional[str] = Field(default=None, description="Screenshot of the profile page")
62+
63+
3364class ProfileActivityOutput (ApiProcessorSchema ):
3465 posts : List [str ] = Field (
3566 description = "Posts and reposts from the profile" ,
@@ -43,6 +74,10 @@ class ProfileActivityOutput(ApiProcessorSchema):
4374 description = "Reactions to the content" ,
4475 default = [],
4576 )
77+ profile : Optional [LinkedInProfile ] = Field (
78+ description = "Profile details" ,
79+ default = None ,
80+ )
4681 profile_url : str = Field (
4782 description = "The profile URL that was used" ,
4883 default = "" ,
@@ -58,24 +93,56 @@ class ProfileActivityConfiguration(ApiProcessorSchema):
5893 description = "LinkedIn login session connection to use" ,
5994 json_schema_extra = {"advanced_parameter" : False , "widget" : "connection" },
6095 )
61- n_posts : int = Field (
62- description = "Number of posts to get" ,
63- default = 5 ,
64- ge = 1 ,
65- le = 100 ,
96+ get_profile_screenshot : bool = Field (
97+ description = "Whether to get a screenshot of the profile" ,
98+ default = False ,
6699 )
67- n_comments : int = Field (
68- description = "Number of comments to get" ,
69- default = 5 ,
70- ge = 1 ,
71- le = 100 ,
100+ get_posts : bool = Field (
101+ description = "Whether to get recent posts" ,
102+ default = True ,
72103 )
73- n_reactions : int = Field (
74- description = "Number of reactions to get" ,
75- default = 5 ,
76- ge = 1 ,
77- le = 100 ,
104+ get_comments : bool = Field (
105+ description = "Whether to get recent comments" ,
106+ default = True ,
107+ )
108+ get_reactions : bool = Field (
109+ description = "Whether to get recent reactions" ,
110+ default = True ,
111+ )
112+
113+
def get_user_profile_details(profile_url, web_browser):
    """Collect the text of the main sections of a profile page.

    Opens ``profile_url`` (with any trailing ``/`` removed) in ``web_browser``,
    waits, and then extracts the text of every recognised profile section found
    in the returned HTML.

    Args:
        profile_url: URL of the profile page to load.
        web_browser: Browser client providing ``run_commands``; the response's
            ``screenshot`` and ``html`` attributes are read.

    Returns:
        A dict that always contains a ``"screenshot"`` key (whatever the
        browser response carried) and, for each section present on the page,
        one of the keys ``"about"``, ``"education"``, ``"experience"``,
        ``"skills"`` or ``"interests"`` mapped to the extracted text.
    """
    # Anchor ids of the profile cards whose text is kept.
    wanted_sections = ("about", "education", "experience", "skills", "interests")

    profile_url = profile_url.rstrip("/")
    browser_response = web_browser.run_commands(
        [
            WebBrowserCommand(
                command_type=WebBrowserCommandType.GOTO,
                data=profile_url,
            ),
            # Give the page time to render; presumably milliseconds — TODO confirm.
            WebBrowserCommand(
                command_type=WebBrowserCommandType.WAIT,
                data="5000",
            ),
        ]
    )

    # The screenshot entry is set unconditionally, so the result is never empty.
    profile_data = {"screenshot": browser_response.screenshot}

    sections = _query_selector_all(browser_response.html, "div#profile-content main section")
    for section in sections:
        card_element = section.select_one(".pv-profile-card__anchor")
        if not card_element:
            continue

        section_id = card_element.attrs.get("id")
        if section_id not in wanted_sections:
            continue

        # Remove all aria-hidden="true" elements in the section
        for aria_hidden in section.select("[aria-hidden=true]"):
            aria_hidden.decompose()

        extraction_result = text_extraction_service.extract_from_bytes(
            section.encode(), mime_type="text/html", filename="file.html"
        )
        profile_data[section_id] = extraction_result.text

    return profile_data
79146
80147
81148def get_user_recent_posts (profile_url , web_browser ):
@@ -96,7 +163,10 @@ def get_user_recent_posts(profile_url, web_browser):
96163 )
97164 page_html = browser_response .html
98165 selectors = _query_selector_all (page_html , "div.feed-shared-update-v2" )
99- text = [selector .text .strip ().rstrip () for selector in selectors ]
166+ text = [
167+ text_extraction_service .extract_from_bytes (selector .encode (), mime_type = "text/html" , filename = "file.html" ).text
168+ for selector in selectors
169+ ]
100170
101171 return text
102172
@@ -119,7 +189,10 @@ def get_user_recent_comments(profile_url, web_browser):
119189 )
120190 page_html = browser_response .html
121191 selectors = _query_selector_all (page_html , "div.feed-shared-update-v2" )
122- text = [selector .text .strip ().rstrip () for selector in selectors ]
192+ text = [
193+ text_extraction_service .extract_from_bytes (selector .encode (), mime_type = "text/html" , filename = "file.html" ).text
194+ for selector in selectors
195+ ]
123196
124197 return text
125198
@@ -141,7 +214,10 @@ def get_user_recent_reactions(profile_url, web_browser):
141214 )
142215 page_html = browser_response .html
143216 selectors = _query_selector_all (page_html , "div.feed-shared-update-v2" )
144- text = [selector .text .strip ().rstrip () for selector in selectors ]
217+ text = [
218+ text_extraction_service .extract_from_bytes (selector .encode (), mime_type = "text/html" , filename = "file.html" ).text
219+ for selector in selectors
220+ ]
145221
146222 return text
147223
@@ -204,27 +280,43 @@ def provider_slug() -> str:
204280 @classmethod
205281 def get_output_template (cls ) -> Optional [OutputTemplate ]:
206282 return OutputTemplate (
207- markdown = """## Posts
283+ markdown = """Profile URL: {{profile_url}}
284+ {% if profile %}
285+ {{profile.about}}
286+
287+ {{profile.experience}}
288+
289+ {{profile.education}}
290+
291+ {{profile.skills}}
292+
293+ {{profile.interests}}
294+ {% endif %}
295+ {% if posts.size > 0 %}
296+ ## Posts
208297
209298{% for post in posts %}
210299{{post}}
211300
212301{% endfor %}
302+ {% endif %}
213303
304+ {% if comments.size > 0 %}
214305## Comments
215306
216307{% for comment in comments %}
217308{{comment}}
218309
219310{% endfor %}
220-
311+ {% endif %}
312+ {% if reactions.size > 0 %}
221313## Reactions
222314
223315{% for reaction in reactions %}
224316{{reaction}}
225317
226318{% endfor %}
227-
319+ {% endif %}
228320{% if error %}
229321{{error}}
230322{% endif %}""" ,
@@ -243,7 +335,7 @@ def process(self) -> dict:
243335 with WebBrowser (
244336 f"{ settings .RUNNER_HOST } :{ settings .RUNNER_PORT } " ,
245337 interactive = False ,
246- capture_screenshot = False ,
338+ capture_screenshot = self . _config . get_profile_screenshot ,
247339 html = True ,
248340 session_data = (
249341 self ._env ["connections" ][self ._config .connection_id ]["configuration" ]["_storage_state" ]
@@ -261,17 +353,47 @@ def process(self) -> dict:
261353 )
262354
263355 if user_profile :
264- user_recent_posts = get_user_recent_posts (self ._input .profile_url , web_browser )
265- user_recent_comments = get_user_recent_comments (self ._input .profile_url , web_browser )
266- user_recent_reactions = get_user_recent_reactions (self ._input .profile_url , web_browser )
267-
268- async_to_sync (output_stream .write )(
269- ProfileActivityOutput (
270- posts = user_recent_posts [: self ._config .n_posts ],
271- comments = user_recent_comments [: self ._config .n_comments ],
272- reactions = user_recent_reactions [: self ._config .n_reactions ],
273- profile_url = self ._input .profile_url ,
274- )
275- )
356+ async_to_sync (output_stream .write )(ProfileActivityOutput (profile_url = user_profile ))
357+ profile_details = get_user_profile_details (user_profile , web_browser )
358+ if profile_details :
359+ async_to_sync (output_stream .write )(
360+ ProfileActivityOutput (profile = LinkedInProfile (** profile_details ))
361+ )
362+ if self ._config .get_posts :
363+ user_recent_posts = get_user_recent_posts (self ._input .profile_url , web_browser )
364+ if user_recent_posts :
365+ async_to_sync (output_stream .write )(ProfileActivityOutput (posts = user_recent_posts ))
366+ else :
367+ async_to_sync (output_stream .write )(
368+ ProfileActivityOutput (
369+ error = f"Could not find any posts for the profile { self ._input .profile_url } " ,
370+ )
371+ )
372+ if self ._config .get_comments :
373+ user_recent_comments = get_user_recent_comments (self ._input .profile_url , web_browser )
374+ if user_recent_comments :
375+ async_to_sync (output_stream .write )(ProfileActivityOutput (comments = user_recent_comments ))
376+ else :
377+ async_to_sync (output_stream .write )(
378+ ProfileActivityOutput (
379+ error = f"Could not find any comments for the profile { self ._input .profile_url } " ,
380+ )
381+ )
382+ if self ._config .get_reactions :
383+ user_recent_reactions = get_user_recent_reactions (self ._input .profile_url , web_browser )
384+ if user_recent_reactions :
385+ async_to_sync (output_stream .write )(ProfileActivityOutput (reactions = user_recent_reactions ))
386+ else :
387+ async_to_sync (output_stream .write )(
388+ ProfileActivityOutput (
389+ error = f"Could not find any reactions for the profile { self ._input .profile_url } " ,
390+ )
391+ )
276392
393+ else :
394+ async_to_sync (output_stream .write )(
395+ ProfileActivityOutput (
396+ error = "Could not find the profile. Please provide a valid profile URL or search term or check your connection." ,
397+ )
398+ )
277399 return output_stream .finalize ()
0 commit comments