Skip to content

Commit 5216d5e

Browse files
committed
Add LinkedIn job detail processor
1 parent 3accea3 commit 5216d5e

File tree

1 file changed

+147
-0
lines changed

1 file changed

+147
-0
lines changed
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
import logging
2+
from typing import Optional
3+
4+
from asgiref.sync import async_to_sync
5+
from bs4 import BeautifulSoup
6+
from langrocks.client import WebBrowser
7+
from langrocks.common.models.web_browser import WebBrowserCommand, WebBrowserCommandType
8+
from pydantic import BaseModel, Field
9+
10+
from llmstack.apps.schemas import OutputTemplate
11+
from llmstack.common.utils.text_extraction_service import PromptlyTextExtractionService
12+
from llmstack.processors.providers.api_processor_interface import (
13+
ApiProcessorInterface,
14+
ApiProcessorSchema,
15+
)
16+
from llmstack.processors.providers.promptly.web_browser import BrowserRemoteSessionData
17+
18+
logger = logging.getLogger(__name__)
19+
20+
text_extraction_service = PromptlyTextExtractionService()
21+
22+
23+
class JobDetailsInput(ApiProcessorSchema):
    # Input schema for the job detail processor.
    # NOTE: kept as comments (not a docstring) so the generated JSON schema
    # description of this model is left untouched.

    # URL of the job posting to open; an empty string when not supplied.
    job_url: str = Field(
        default="",
        description="The job URL",
    )
25+
26+
27+
class LinkedInJob(BaseModel):
    # Scraped details of a single job posting. Both fields are optional
    # because either may be absent from the page that was loaded.

    # Job title text.
    title: Optional[str] = Field(
        default=None,
        description="The job title",
    )
    # Plain-text job description.
    description: Optional[str] = Field(
        default=None,
        description="The job description",
    )
30+
31+
32+
class JobDetailOutput(ApiProcessorSchema):
    # Output schema for the job detail processor. Every field defaults to
    # None, so a single instance can carry just the piece being emitted
    # (job details, an error, or browser session data).

    # Scraped job details, when the lookup succeeded.
    job: Optional[LinkedInJob] = Field(
        default=None,
        description="The job details",
    )
    # Human-readable error message, when the lookup failed.
    error: Optional[str] = Field(
        description="Error message if something went wrong",
        default=None,
    )
    # Remote browser session data (e.g. the websocket URL for streaming).
    session: Optional[BrowserRemoteSessionData] = Field(
        description="Session data from the browser",
        default=None,
    )
42+
43+
44+
class JobDetailConfiguration(ApiProcessorSchema):
    # Configuration schema for the job detail processor.

    # Optional connection whose stored browser state is used for the session.
    connection_id: Optional[str] = Field(
        description="LinkedIn login session connection to use",
        default=None,
        json_schema_extra={"advanced_parameter": False, "widget": "connection"},
    )
    # Passed to the browser client as its ``capture_screenshot`` flag.
    get_page_screenshot: bool = Field(
        default=False,
        description="Whether to get a screenshot of the job details page",
        json_schema_extra={"advanced_parameter": True},
    )
    # Passed to the browser client as its ``interactive`` flag; also gates
    # emitting the browser websocket URL to the output stream.
    stream_video: bool = Field(
        default=False,
        description="Stream video",
    )
56+
57+
58+
def get_linkedin_job_detail(job_url, web_browser):
    """Load a job page in the remote browser and scrape its title and description.

    Args:
        job_url: URL of the job posting to navigate to.
        web_browser: Browser client exposing ``run_commands``; its response is
            expected to carry the rendered page markup in an ``html`` attribute.

    Returns:
        A dict that may contain a ``"title"`` key and/or a ``"description"``
        key. It is empty when nothing could be extracted (including when the
        browser returned no HTML), which callers treat as a failure.
    """
    browser_response = web_browser.run_commands(
        [
            WebBrowserCommand(
                command_type=WebBrowserCommandType.GOTO,
                data=job_url,
            ),
            # Pause before the page HTML is captured so dynamic content can
            # render (presumably milliseconds -- confirm against langrocks).
            WebBrowserCommand(
                command_type=WebBrowserCommandType.WAIT,
                data="2000",
            ),
        ]
    )

    # A missing/None ``html`` would make BeautifulSoup raise; treat it as an
    # empty document so the caller gets an empty result instead of a crash.
    page_html = getattr(browser_response, "html", None) or ""
    soup = BeautifulSoup(page_html, "html.parser")
    job_detail = {}

    title_tag = soup.select_one("head title")
    if title_tag is not None:
        # Keep only the part before the first "|" and drop the whitespace
        # that surrounds the separator.
        job_detail["title"] = title_tag.text.split("|")[0].strip()

    # The logged-in layout wins over the logged-out one when both exist,
    # so it is checked first and extraction runs at most once.
    description_selectors = (
        "div.jobs-description-content__text",  # logged-in page
        "div.description__text",  # logged-out page
    )
    for selector in description_selectors:
        description_tag = soup.select_one(selector)
        if description_tag is not None:
            job_detail["description"] = text_extraction_service.extract_from_bytes(
                description_tag.encode(), mime_type="text/html", filename="file.html"
            ).text
            break

    return job_detail
92+
93+
94+
class JobDetailProcessor(ApiProcessorInterface[JobDetailsInput, JobDetailOutput, JobDetailConfiguration]):
    """Processor that opens a job URL in a remote browser and emits its details."""

    @staticmethod
    def name() -> str:
        """Return the display name of the processor."""
        return "Job Detail"

    @staticmethod
    def slug() -> str:
        """Return the processor slug."""
        return "job_detail"

    @staticmethod
    def description() -> str:
        """Return a short description of the processor."""
        return "Gets job listing details from the URL."

    @staticmethod
    def provider_slug() -> str:
        """Return the slug of the provider this processor belongs to."""
        return "linkedin"

    @classmethod
    def get_output_template(cls) -> Optional[OutputTemplate]:
        """Return the output template: a browser embed plus the ``$.job`` JSON path."""
        embed_markdown = (
            """<promptly-web-browser-embed wsUrl="{{session.ws_url}}"></promptly-web-browser-embed>"""
        )
        return OutputTemplate(markdown=embed_markdown, jsonpath="$.job")

    def process(self) -> dict:
        """Scrape the configured job URL and write the result to the output stream.

        Writes the browser websocket URL first when video streaming is enabled
        and a URL is available, then writes either the scraped job or an error
        message, and returns the finalized output stream.
        """
        from django.conf import settings

        config = self._config

        # Reuse the stored browser state of the selected connection, if any.
        if config.connection_id:
            storage_state = self._env["connections"][config.connection_id]["configuration"]["_storage_state"]
        else:
            storage_state = ""

        runner_address = f"{settings.RUNNER_HOST}:{settings.RUNNER_PORT}"
        write_output = async_to_sync(self._output_stream.write)

        with WebBrowser(
            runner_address,
            interactive=config.stream_video,
            capture_screenshot=config.get_page_screenshot,
            html=True,
            session_data=storage_state,
        ) as web_browser:
            if config.stream_video and web_browser.get_wss_url():
                session = BrowserRemoteSessionData(ws_url=web_browser.get_wss_url())
                write_output(JobDetailOutput(session=session))

            job = get_linkedin_job_detail(self._input.job_url, web_browser)
            if not job:
                write_output(JobDetailOutput(error="Failed to get job details"))
            else:
                write_output(JobDetailOutput(job=LinkedInJob(**job)))

        return self._output_stream.finalize()

0 commit comments

Comments
 (0)