32 changes: 16 additions & 16 deletions .github/workflows/sast.yml
@@ -21,19 +21,19 @@ jobs:
     runs-on: ubuntu-latest
     timeout-minutes: 10
     steps:
-      - uses: actions/checkout@v3
-      - name: Test
-        run: |
-          echo UID=$(id -u) >> .env
-          docker run --rm --user=$(id -u) \
-            -v "${PWD}:/code" \
-            -w /code \
-            -e MAVEN_OPTS=" -ntp " \
-            -e RUN_OWASP_DEPENDENCY_CHECK=false \
-            -e RUN_SPOTBUGS_CHECK=false \
-            -e RUN_SPOTLESS_CHECK=false \
-            -e RUN_SPOTLESS_APPLY=true \
-            -e HOME=/tmp \
-            -e USER=nobody \
-            -e BANDIT_CONFIG_FILE=/code/.bandit.yaml \
-            ghcr.io/par-tec/super-sast:latest
+      - uses: actions/checkout@v3
+      - name: Test
+        run: |
+          echo UID="$(id -u)" >> .env
+          docker run --rm --user="$(id -u)" \
+            -v "${PWD}:/code" \
+            -w /code \
+            -e MAVEN_OPTS=" -ntp " \
+            -e RUN_OWASP_DEPENDENCY_CHECK=false \
+            -e RUN_SPOTBUGS_CHECK=false \
+            -e RUN_SPOTLESS_CHECK=false \
+            -e RUN_SPOTLESS_APPLY=true \
+            -e HOME=/tmp \
+            -e USER=nobody \
+            -e BANDIT_CONFIG_FILE=/code/.bandit.yaml \
+            ghcr.io/par-tec/super-sast:latest
1 change: 1 addition & 0 deletions README.md
@@ -56,6 +56,7 @@ Use the `EscoCV` and the `Ner` classes to extract skills from text:
 from esco.cv import EscoCV
 from esco import LocalDB
 from esco.ner import Ner
+import nltk

 # Initialize the vector index (slow) on disk.
 # This can be reused later.
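For context, a minimal usage sketch consistent with the signatures visible in this PR (`LocalDB()`, `EscoCV(ner, text=...)`, `cv.skills()`); the diff does not show how `Ner` is constructed, so the `Ner(db)` call below is an assumption:

```python
from esco import LocalDB
from esco.cv import EscoCV
from esco.ner import Ner

db = LocalDB()  # initialize the vector index (slow) on disk; can be reused later
ner = Ner(db)   # assumption: a Ner instance wired to the local ESCO DB
cv = EscoCV(ner, text="I design cloud architectures and automate cloud tasks.")
print(cv.skills())  # {uri: {"label": ..., "count": ..., "score": ...}}
```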
1 change: 1 addition & 0 deletions docker-compose.yaml
@@ -14,6 +14,7 @@ services:
     working_dir: /code
     volumes:
       - .:/code
+    cpus: "6.0"
     entrypoint: [sleep, infinity]
   test:
     <<: *base
43 changes: 40 additions & 3 deletions esco/cv.py
@@ -1,3 +1,16 @@
"""
Extracts and analyzes skills from CV text using ESCO classifications.

Features:
- Entity counting and skill extraction via NER and neural search
- Integration with ESCO database
- Sentence-level skill analysis

Note: Avoid personal data in CV text to prevent NER confusion.

Usage: Import EscoCV class to process CV text and extract skills.
"""

import logging

import esco
@@ -30,7 +43,8 @@ class EscoCV:
     A CV skill extractor.

     The text should not contain personal data,
-    since this may confuse the NER model (e.g., the text "address: Java street" may be recognized as a skill).
+    since this may confuse the NER model
+    (e.g., the text "address: Java street" may be recognized as a skill).
     """

     def __init__(self, ner, text=None, doc=None) -> None:
@@ -48,6 +62,17 @@ def __init__(self, ner, text=None, doc=None) -> None:
         self.sentences = []

     def entities(self):
+        """
+        @return a dict containing entities and their count.
+        The dict has two keys:
+        - 'entities': a list of dicts, each representing an entity with:
+          - 'start': start character index
+          - 'end': end character index
+          - 'label': entity label
+          - 'text': entity text
+          - 'id': entity ID
+        - 'count': total number of entities found
+        """
         return {
             "entities": [
                 {
@@ -93,7 +118,7 @@ def ner_skills(self, force=False) -> dict:
                 for skill in skills:
                     ret[skill["uri"]] = {"label": skill["label"], "count": e["count"]}
             else:
-                log.debug(f"Ignoring other labels: {e['label']}")
+                log.debug("Ignoring other labels: %s", e["label"])
         self._ner_skills = ret
         return ret
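As an annotation on the `entities()` docstring added above, a hypothetical return value (all field values are illustrative, not taken from a real run):

```python
# Hypothetical EscoCV.entities() output for a CV mentioning one skill.
example = {
    "entities": [
        {
            "start": 23,       # start character index in the CV text
            "end": 27,         # end character index
            "label": "SKILL",  # assumed entity label
            "text": "Java",
            "id": "http://data.europa.eu/esco/skill/...",  # URI elided
        }
    ],
    "count": 1,  # total number of entities found
}
```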

@@ -137,6 +162,18 @@ def skills_by_sentence(self, force=False):
         return self.sentences

     def skills(self, force=False):
+        """
+        @param force: if True, force recalculation even if results are cached.
+        @return a dict of skills, where:
+        - Key: skill URI
+        - Value: dict containing:
+          - 'label': skill label
+          - 'count': number of occurrences
+          - 'score': relevance score (if available)
+        Note:
+        - Skills with labels shorter than 5 characters are skipped.
+        - For duplicate skills, count is incremented and score is maximized.
+        """
         if self._all_skills and not force:
             return self._all_skills
         ner_skills = {} | self.ner_skills(force=force)
@@ -146,7 +183,7 @@ def skills(self, force=False):

if len(skill["label"]) < 5:
log.debug(
f"Skipping {skill['label']}. Too short for neural search."
"Skipping %s. Too short for neural search.", skill["label"]
)
continue
if uri not in ner_skills:
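A minimal sketch of the duplicate-handling rule documented in the new `skills()` docstring (counts add up, the score is maximized); `merge_skill` is a hypothetical helper, not part of the library, and the URIs are placeholders:

```python
def merge_skill(acc, uri, label, count, score=0.0):
    """Fold one skill occurrence into an accumulator keyed by URI."""
    if uri not in acc:
        acc[uri] = {"label": label, "count": count, "score": score}
    else:
        acc[uri]["count"] += count  # duplicate skill: increment count
        acc[uri]["score"] = max(acc[uri]["score"], score)  # keep the best score
    return acc

skills = {}
merge_skill(skills, "esco:s1", "design cloud architecture", 1, 0.7)  # placeholder URI
merge_skill(skills, "esco:s1", "design cloud architecture", 2, 0.9)
assert skills["esco:s1"] == {"label": "design cloud architecture", "count": 3, "score": 0.9}
```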
163 changes: 163 additions & 0 deletions model/esco-chunks.ipynb
@@ -0,0 +1,163 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from esco import LocalDB\n",
"db = LocalDB()\n",
"skills_labels = [\n",
" \"collaborate with engineers\",\n",
" \"deploy cloud resource\",\n",
" \"design cloud architecture\",\n",
" \"design cloud networks\",\n",
" \"plan migration to cloud\",\n",
" \"automate cloud tasks\",\n",
" \"coordinate engineering teams\",\n",
" \"design database in the cloud\",\n",
" \"design for organisational complexity\",\n",
" \"develop with cloud services\",\n",
" \"do cloud refactoring\",\n",
" ]\n",
"skills = db.skills[db.skills.label.str.lower().isin(skills_labels)]\n",
"labels = [l for labels in skills.allLabel for l in labels]\n",
"\n",
"import spacy\n",
"nlp = spacy.load(\"en_core_web_trf\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def find_root(doc):\n",
" for prefix in (\"\", \"to \"):\n",
" doc = nlp(prefix + doc.text)\n",
" for token in doc:\n",
" if token.dep_ == \"ROOT\" and token.pos_ == \"VERB\":\n",
" return token\n",
" return None\n",
"\n",
"def find_obj(token):\n",
" for child in token.children:\n",
" if child.dep_ == \"prep\":\n",
" return find_obj(child)\n",
" if child.dep_ in (\"dobj\", \"pobj\", \"nsubj\"):\n",
" return child\n",
" return None\n",
"\n",
"def find_compound(token):\n",
" compounds = []\n",
" if token:\n",
" compounds.append(token.lemma_)\n",
" for child in token.children:\n",
" if child.dep_ == \"compound\":\n",
" compounds.append(child.lemma_)\n",
" return \" \".join(reversed(compounds)) # Combine compounds into a single string\n",
"comp_list = []\n",
"docs = nlp.pipe(labels)\n",
"for doc in docs:\n",
" verb = None\n",
" for token in doc:\n",
" verb = find_root(doc)\n",
" if not verb:\n",
" print(\"****missing verb\", doc.text)\n",
" continue\n",
" obj = find_obj(verb)\n",
" compound_str = find_compound(obj)\n",
" comp_list.append(compound_str)\n",
" print(verb.lemma_, compound_str, f\"-{doc.text}-\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Funzione per estrarre competenze potenziali\n",
"def extract_potential_skills(text):\n",
" doc = nlp(text)\n",
" chunks = []\n",
" for chunk in doc.noun_chunks:\n",
" if chunk.root.dep_ in ['dobj', 'pobj'] and chunk.root.head.pos_ == 'VERB':\n",
" verb = chunk.root.head\n",
" start = verb.i\n",
" end = chunk.end\n",
" chunks.append(doc[start:end])\n",
" return chunks\n",
"\n",
"# Funzione per valutare la rilevanza di un chunk come competenza\n",
"def evaluate_chunk_as_skill(chunk, known_skills):\n",
" chunk_text = chunk.text.lower()\n",
" for skill in known_skills:\n",
" if skill in chunk_text:\n",
" return True\n",
" return False\n",
"\n",
"text = \"During my internship, I wrote code to integrate cloud services and cloud networks.\"\n",
"\n",
"potential_skills = extract_potential_skills(text)\n",
"for chunk in potential_skills:\n",
" if evaluate_chunk_as_skill(chunk, comp_list):\n",
" print(f\"Potential skill identified: {chunk}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```\n",
"Potential skill identified: wrote code\n",
"Potential skill identified: integrate cloud services\n",
"```\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test_text = [\n",
" \"I am an experienced IT professional with a strong background in cloud computing and software development. \",\n",
" \"I specialize in planning and executing cloud migrations, designing cloud architectures, and automating cloud tasks. \",\n",
" \"My skills include creating cloud network, managing cloud resource, and developing cloud applications. \",\n",
" \"I am also adept to designing cloud environments for large organizations, ensuring they are scalable and efficient.\"\n",
"]\n",
"for text in test_text:\n",
" potential_skills = extract_potential_skills(text)\n",
" for chunk in potential_skills:\n",
" if evaluate_chunk_as_skill(chunk, comp_list):\n",
" print(f\"Potential skill identified: {chunk}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```\n",
"Potential skill identified: executing cloud migrations\n",
"Potential skill identified: designing cloud architectures\n",
"Potential skill identified: automating cloud tasks\n",
"Potential skill identified: creating cloud network\n",
"Potential skill identified: managing cloud resource\n",
"Potential skill identified: developing cloud applications\n",
"Potential skill identified: designing cloud environments\n",
"```"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
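The notebook's heuristics lean on three spaCy dependency relations: the ROOT verb, its dobj/pobj object, and compound modifiers. A standalone probe of those labels, assuming `en_core_web_sm` is installed for a lighter download (the notebook itself uses `en_core_web_trf`):

```python
import spacy

# Print the dependency relations that find_root/find_obj/find_compound rely on.
nlp = spacy.load("en_core_web_sm")
for token in nlp("to design cloud networks"):
    print(f"{token.text:10} dep={token.dep_:10} pos={token.pos_:6} head={token.head.text}")
# Typically: "design" parses as the ROOT verb, "networks" as its dobj,
# and "cloud" as a compound child of "networks".
```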