32 changes: 16 additions & 16 deletions .github/workflows/sast.yml
@@ -21,19 +21,19 @@ jobs:
     runs-on: ubuntu-latest
     timeout-minutes: 10
     steps:
-      - uses: actions/checkout@v3
-      - name: Test
-        run: |
-          echo UID=$(id -u) >> .env
-          docker run --rm --user=$(id -u) \
-            -v "${PWD}:/code" \
-            -w /code \
-            -e MAVEN_OPTS=" -ntp " \
-            -e RUN_OWASP_DEPENDENCY_CHECK=false \
-            -e RUN_SPOTBUGS_CHECK=false \
-            -e RUN_SPOTLESS_CHECK=false \
-            -e RUN_SPOTLESS_APPLY=true \
-            -e HOME=/tmp \
-            -e USER=nobody \
-            -e BANDIT_CONFIG_FILE=/code/.bandit.yaml \
-            ghcr.io/par-tec/super-sast:latest
+      - uses: actions/checkout@v3
+      - name: Test
+        run: |
+          echo UID="$(id -u)" >> .env
+          docker run --rm --user="$(id -u)" \
+            -v "${PWD}:/code" \
+            -w /code \
+            -e MAVEN_OPTS=" -ntp " \
+            -e RUN_OWASP_DEPENDENCY_CHECK=false \
+            -e RUN_SPOTBUGS_CHECK=false \
+            -e RUN_SPOTLESS_CHECK=false \
+            -e RUN_SPOTLESS_APPLY=true \
+            -e HOME=/tmp \
+            -e USER=nobody \
+            -e BANDIT_CONFIG_FILE=/code/.bandit.yaml \
+            ghcr.io/par-tec/super-sast:latest
1 change: 1 addition & 0 deletions README.md
@@ -56,6 +56,7 @@ Use the `EscoCV` and the `Ner` classes to extract skills from text:
 from esco.cv import EscoCV
 from esco import LocalDB
 from esco.ner import Ner
+import nltk

 # Initialize the vector index (slow) on disk.
 # This can be reused later.
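For context, a minimal usage sketch consistent with the signatures visible in this PR (`LocalDB()`, `EscoCV(ner, text=...)`, `cv.skills()`); the diff does not show how `Ner` is constructed, so the `Ner(db)` call below is an assumption:

```python
from esco import LocalDB
from esco.cv import EscoCV
from esco.ner import Ner

db = LocalDB()  # initialize the vector index (slow) on disk; can be reused later
ner = Ner(db)   # assumption: a Ner instance wired to the local ESCO DB
cv = EscoCV(ner, text="I design cloud architectures and automate cloud tasks.")
print(cv.skills())  # {uri: {"label": ..., "count": ..., "score": ...}}
```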
1 change: 1 addition & 0 deletions docker-compose.yaml
@@ -14,6 +14,7 @@ services:
     working_dir: /code
     volumes:
       - .:/code
+    cpus: "6.0"
     entrypoint: [sleep, infinity]
   test:
     <<: *base
43 changes: 40 additions & 3 deletions esco/cv.py
@@ -1,3 +1,16 @@
"""
Extracts and analyzes skills from CV text using ESCO classifications.

Features:
- Entity counting and skill extraction via NER and neural search
- Integration with ESCO database
- Sentence-level skill analysis

Note: Avoid personal data in CV text to prevent NER confusion.

Usage: Import EscoCV class to process CV text and extract skills.
"""

import logging

import esco
@@ -30,7 +43,8 @@ class EscoCV:
     A CV skill extractor.

     The text should not contain personal data,
-    since this may confuse the NER model (e.g., the text "address: Java street" may be recognized as a skill).
+    since this may confuse the NER model
+    (e.g., the text "address: Java street" may be recognized as a skill).
     """

     def __init__(self, ner, text=None, doc=None) -> None:
@@ -48,6 +62,17 @@ def __init__(self, ner, text=None, doc=None) -> None:
         self.sentences = []

     def entities(self):
+        """
+        @return a dict containing entities and their count.
+        The dict has two keys:
+        - 'entities': a list of dicts, each representing an entity with:
+          - 'start': start character index
+          - 'end': end character index
+          - 'label': entity label
+          - 'text': entity text
+          - 'id': entity ID
+        - 'count': total number of entities found
+        """
         return {
             "entities": [
                 {
@@ -93,7 +118,7 @@ def ner_skills(self, force=False) -> dict:
                 for skill in skills:
                     ret[skill["uri"]] = {"label": skill["label"], "count": e["count"]}
             else:
-                log.debug(f"Ignoring other labels: {e['label']}")
+                log.debug("Ignoring other labels: %s", e["label"])
         self._ner_skills = ret
         return ret
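As an annotation on the `entities()` docstring added above, a hypothetical return value (all field values are illustrative, not taken from a real run):

```python
# Hypothetical EscoCV.entities() output for a CV mentioning one skill.
example = {
    "entities": [
        {
            "start": 23,       # start character index in the CV text
            "end": 27,         # end character index
            "label": "SKILL",  # assumed entity label
            "text": "Java",
            "id": "http://data.europa.eu/esco/skill/...",  # URI elided
        }
    ],
    "count": 1,  # total number of entities found
}
```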

@@ -137,6 +162,18 @@ def skills_by_sentence(self, force=False):
         return self.sentences

     def skills(self, force=False):
+        """
+        @param force: if True, force recalculation even if results are cached.
+        @return a dict of skills, where:
+        - Key: skill URI
+        - Value: dict containing:
+          - 'label': skill label
+          - 'count': number of occurrences
+          - 'score': relevance score (if available)
+        Note:
+        - Skills with labels shorter than 5 characters are skipped.
+        - For duplicate skills, count is incremented and score is maximized.
+        """
         if self._all_skills and not force:
             return self._all_skills
         ner_skills = {} | self.ner_skills(force=force)
@@ -146,7 +183,7 @@ def skills(self, force=False):

if len(skill["label"]) < 5:
log.debug(
f"Skipping {skill['label']}. Too short for neural search."
"Skipping %s. Too short for neural search.", skill["label"]
)
continue
if uri not in ner_skills:
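A minimal sketch of the duplicate-handling rule documented in the new `skills()` docstring (counts add up, the score is maximized); `merge_skill` is a hypothetical helper, not part of the library, and the URIs are placeholders:

```python
def merge_skill(acc, uri, label, count, score=0.0):
    """Fold one skill occurrence into an accumulator keyed by URI."""
    if uri not in acc:
        acc[uri] = {"label": label, "count": count, "score": score}
    else:
        acc[uri]["count"] += count  # duplicate skill: increment count
        acc[uri]["score"] = max(acc[uri]["score"], score)  # keep the best score
    return acc

skills = {}
merge_skill(skills, "esco:s1", "design cloud architecture", 1, 0.7)  # placeholder URI
merge_skill(skills, "esco:s1", "design cloud architecture", 2, 0.9)
assert skills["esco:s1"] == {"label": "design cloud architecture", "count": 3, "score": 0.9}
```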
163 changes: 163 additions & 0 deletions model/esco-chunks.ipynb
@@ -0,0 +1,163 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from esco import LocalDB\n",
"db = LocalDB()\n",
"skills_labels = [\n",
" \"collaborate with engineers\",\n",
" \"deploy cloud resource\",\n",
" \"design cloud architecture\",\n",
" \"design cloud networks\",\n",
" \"plan migration to cloud\",\n",
" \"automate cloud tasks\",\n",
" \"coordinate engineering teams\",\n",
" \"design database in the cloud\",\n",
" \"design for organisational complexity\",\n",
" \"develop with cloud services\",\n",
" \"do cloud refactoring\",\n",
" ]\n",
"skills = db.skills[db.skills.label.str.lower().isin(skills_labels)]\n",
"labels = [l for labels in skills.allLabel for l in labels]\n",
"\n",
"import spacy\n",
"nlp = spacy.load(\"en_core_web_trf\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def find_root(doc):\n",
" for prefix in (\"\", \"to \"):\n",
" doc = nlp(prefix + doc.text)\n",
" for token in doc:\n",
" if token.dep_ == \"ROOT\" and token.pos_ == \"VERB\":\n",
" return token\n",
" return None\n",
"\n",
"def find_obj(token):\n",
" for child in token.children:\n",
" if child.dep_ == \"prep\":\n",
" return find_obj(child)\n",
" if child.dep_ in (\"dobj\", \"pobj\", \"nsubj\"):\n",
" return child\n",
" return None\n",
"\n",
"def find_compound(token):\n",
" compounds = []\n",
" if token:\n",
" compounds.append(token.lemma_)\n",
" for child in token.children:\n",
" if child.dep_ == \"compound\":\n",
" compounds.append(child.lemma_)\n",
" return \" \".join(reversed(compounds)) # Combine compounds into a single string\n",
"comp_list = []\n",
"docs = nlp.pipe(labels)\n",
"for doc in docs:\n",
" verb = None\n",
" for token in doc:\n",
" verb = find_root(doc)\n",
" if not verb:\n",
" print(\"****missing verb\", doc.text)\n",
" continue\n",
" obj = find_obj(verb)\n",
" compound_str = find_compound(obj)\n",
" comp_list.append(compound_str)\n",
" print(verb.lemma_, compound_str, f\"-{doc.text}-\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Funzione per estrarre competenze potenziali\n",
"def extract_potential_skills(text):\n",
" doc = nlp(text)\n",
" chunks = []\n",
" for chunk in doc.noun_chunks:\n",
" if chunk.root.dep_ in ['dobj', 'pobj'] and chunk.root.head.pos_ == 'VERB':\n",
" verb = chunk.root.head\n",
" start = verb.i\n",
" end = chunk.end\n",
" chunks.append(doc[start:end])\n",
" return chunks\n",
"\n",
"# Funzione per valutare la rilevanza di un chunk come competenza\n",
"def evaluate_chunk_as_skill(chunk, known_skills):\n",
" chunk_text = chunk.text.lower()\n",
" for skill in known_skills:\n",
" if skill in chunk_text:\n",
" return True\n",
" return False\n",
"\n",
"text = \"During my internship, I wrote code to integrate cloud services and cloud networks.\"\n",
"\n",
"potential_skills = extract_potential_skills(text)\n",
"for chunk in potential_skills:\n",
" if evaluate_chunk_as_skill(chunk, comp_list):\n",
" print(f\"Potential skill identified: {chunk}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```\n",
"Potential skill identified: wrote code\n",
"Potential skill identified: integrate cloud services\n",
"```\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test_text = [\n",
" \"I am an experienced IT professional with a strong background in cloud computing and software development. \",\n",
" \"I specialize in planning and executing cloud migrations, designing cloud architectures, and automating cloud tasks. \",\n",
" \"My skills include creating cloud network, managing cloud resource, and developing cloud applications. \",\n",
" \"I am also adept to designing cloud environments for large organizations, ensuring they are scalable and efficient.\"\n",
"]\n",
"for text in test_text:\n",
" potential_skills = extract_potential_skills(text)\n",
" for chunk in potential_skills:\n",
" if evaluate_chunk_as_skill(chunk, comp_list):\n",
" print(f\"Potential skill identified: {chunk}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```\n",
"Potential skill identified: executing cloud migrations\n",
"Potential skill identified: designing cloud architectures\n",
"Potential skill identified: automating cloud tasks\n",
"Potential skill identified: creating cloud network\n",
"Potential skill identified: managing cloud resource\n",
"Potential skill identified: developing cloud applications\n",
"Potential skill identified: designing cloud environments\n",
"```"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
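The notebook's heuristics lean on three spaCy dependency relations: the ROOT verb, its dobj/pobj object, and compound modifiers. A standalone probe of those labels, assuming `en_core_web_sm` is installed for a lighter download (the notebook itself uses `en_core_web_trf`):

```python
import spacy

# Print the dependency relations that find_root/find_obj/find_compound rely on.
nlp = spacy.load("en_core_web_sm")
for token in nlp("to design cloud networks"):
    print(f"{token.text:10} dep={token.dep_:10} pos={token.pos_:6} head={token.head.text}")
# Typically: "design" parses as the ROOT verb, "networks" as its dobj,
# and "cloud" as a compound child of "networks".
```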