abcsys · daiwaid · Jan 19, 2025 · Jan 27, 2025
diff --git a/Makefile b/Makefile
@@ -8,7 +8,7 @@ install:
 
 .PHONY: run map
 map: 
-	python examples/map.py
+	python examples/tasks/map/map.py
 run: map
 
 .PHONY: benchmark_match, benchmark_mapper, test

diff --git a/examples/kafka/s2_producer.py b/examples/kafka/s2_producer.py
@@ -23,6 +23,7 @@
 
 # User data to be sent
 user_data = {"name": "John Doe", "age": 28}
+print("User:", user_data)
 
 # Produce message
 producer.produce(topic='user-info', key=str(user_data['name']), value=user_data)

diff --git a/examples/kafka/s4_producer_v2.py b/examples/kafka/s4_producer_v2.py
@@ -18,6 +18,7 @@
 
 # Updated user data to be sent with the new schema
 user_data = {"name": "Jane Doe", "age": 27, "email": "janedoe@example.com"}
+print("User:", user_data)
 
 # Produce message
 producer.produce(topic='user-info', key=str(user_data['name']), value=user_data)

diff --git a/examples/tasks/assemble/assemble.py b/examples/tasks/assemble/assemble.py
@@ -1,8 +1,43 @@
+import pandas as pd
+
 import llmint
+from llmint.assemble.pandas import assemble, construct
 
 
 def main():
-    llmint.assemble()
+    source_schema = '''
+    {
+        "fields": [
+            {"name": "Fname", "type": "string"},
+            {"name": "Lname", "type": "string"},
+            {"name": "Age", "type": "int"},
+            {"name": "Email", "type": ["null", "string"], "default": null}
+        ]
+    }
+    '''
+    target_schema = '''
+    {
+        "fields": [
+            {"name": "name", "type": "string"},
+            {"name": "age", "type": "int"},
+            {"name": "email", "type": ["null", "string"], "default": null}
+        ]
+    }
+    '''
+
+
+    source_df = pd.DataFrame([{"Fname": "Josh", "Lname": "Doe", "Age": 31, "Email": "joshdoe@example.com"}])
+    dest_df = pd.DataFrame([{"name": "Jane Doe", "age": 27, "email": "janedoe@example.com"}])
+    print("Concat the source dataframe to the dest dataframe:")
+    print("Source:", source_df, sep="\n")
+    print("Dest:", dest_df, sep="\n")
+
+    mappings = llmint.map(source_schema, target_schema)
+    assembly = assemble(mappings)
+    output = construct(source_df, assembly)
+
+    combined_df = pd.concat([dest_df, output], axis=0)
+    print("\nCombined:", combined_df, sep="\n")
 
 
 if __name__ == "__main__":

diff --git a/llmint/assemble/pandas/__init__.py b/llmint/assemble/pandas/__init__.py
@@ -0,0 +1 @@
+from llmint.assemble.pandas.function import assemble, construct
diff --git a/llmint/assemble/pandas/function.py b/llmint/assemble/pandas/function.py
@@ -0,0 +1,39 @@
+import pandas as pd
+from typing import List, Callable
+
+from llmint.assemble.pandas.transform import (
+    add, copy, default, missing, apply, scale, shift
+)
+from llmint.map.function import Map
+
+
+def assemble(mappings: list[Map]):
+    output = []
+
+    for mapping in mappings:
+        match mapping.transformation.split(' ')[0]:
+            case 'ADD':
+                output.append(add(mapping))
+            case 'COPY':
+                output.append(copy(mapping))
+            case 'DEFAULT':
+                output.append(default(mapping))
+            case 'MISSING':
+                output.append(missing(mapping))
+            case 'APPLY':
+                output.append(apply(mapping))
+            case 'SCALE':
+                output.append(scale(mapping))
+            case 'SHIFT':
+                output.append(shift(mapping))   
+
+    return output
+
+
+def construct(df: pd.DataFrame, assembly: List[Callable[[pd.DataFrame], pd.Series]]):
+    df_output = []
+
+    for func in assembly:
+        df_output.append(func(df))
+
+    return pd.concat(df_output, axis=1)
diff --git a/llmint/assemble/pandas/transform/__init__.py b/llmint/assemble/pandas/transform/__init__.py
@@ -0,0 +1,8 @@
+from llmint.assemble.pandas.transform.field.add import func as add
+from llmint.assemble.pandas.transform.field.copy import func as copy
+from llmint.assemble.pandas.transform.field.default import func as default
+from llmint.assemble.pandas.transform.field.missing import func as missing
+
+from llmint.assemble.pandas.transform.value.apply import func as apply
+from llmint.assemble.pandas.transform.value.scale import func as scale
+from llmint.assemble.pandas.transform.value.shift import func as shift
diff --git a/llmint/assemble/pandas/transform/field/__init__.py b/llmint/assemble/pandas/transform/field/__init__.py
diff --git a/llmint/assemble/pandas/transform/field/add.py b/llmint/assemble/pandas/transform/field/add.py
@@ -0,0 +1,10 @@
+import re
+from pandas import Series
+
+from llmint.map.function import Map
+
+
+def func(mapping: Map):
+    col_type = re.search(r'TYPE (\w+)', mapping.transformation).group(1)
+
+    return lambda df: Series([], name=mapping.target_field, dtype=col_type)
diff --git a/llmint/assemble/pandas/transform/field/copy.py b/llmint/assemble/pandas/transform/field/copy.py
@@ -0,0 +1,7 @@
+from pandas import Series
+
+from llmint.map.function import Map
+
+
+def func(mapping: Map):
+    return lambda df: Series(df[mapping.source_field], name=mapping.target_field)
diff --git a/llmint/assemble/pandas/transform/field/default.py b/llmint/assemble/pandas/transform/field/default.py
@@ -0,0 +1,10 @@
+import re
+from pandas import Series
+
+from llmint.map.function import Map
+
+
+def func(mapping: Map):
+    default_val = re.search(r'DEFAULT TO (.*)', mapping.transformation).group(1)
+
+    return lambda df: Series([default_val] * len(df), name=mapping.target_field)
diff --git a/llmint/assemble/pandas/transform/field/missing.py b/llmint/assemble/pandas/transform/field/missing.py
@@ -0,0 +1,5 @@
+from llmint.map.function import Map
+
+
+def func(mapping: Map):
+    return lambda df: print(f"WARNING: {mapping.target_field} field cannot be automatically converted.")
diff --git a/llmint/assemble/pandas/transform/value/__init__.py b/llmint/assemble/pandas/transform/value/__init__.py
diff --git a/llmint/assemble/pandas/transform/value/apply.py b/llmint/assemble/pandas/transform/value/apply.py
@@ -0,0 +1,19 @@
+import re
+from pandas import Series, DataFrame
+
+from llmint.map.function import Map
+
+
+def func(mapping: Map):
+    apply_func = re.search(r'APPLY (.*)', mapping.transformation).group(1)
+
+    def apply(df: DataFrame):
+        # assign all columns to their own variables
+        for col in df.columns:
+            exec(f'{col.replace(" ", "_")} = df[col]', locals(), globals())
+
+        exec(f'_output = {apply_func}', locals(), globals())
+
+        return Series(_output, name=mapping.target_field)
+
+    return apply
diff --git a/llmint/assemble/pandas/transform/value/scale.py b/llmint/assemble/pandas/transform/value/scale.py
@@ -0,0 +1,12 @@
+import re
+
+from llmint.map.function import Map
+
+
+def func(mapping: Map):
+    try:
+        scale = float(re.search(r'SCALE BY (\d*.\d*)', mapping.transformation).group(1))
+    except ValueError:
+        return lambda df: df[mapping.source_field].copy()
+
+    return lambda df: df[mapping.source_field] * scale
diff --git a/llmint/assemble/pandas/transform/value/shift.py b/llmint/assemble/pandas/transform/value/shift.py
@@ -0,0 +1,12 @@
+import re
+
+from llmint.map.function import Map
+
+
+def func(mapping: Map):
+    try:
+        shift = float(re.search(r'SHIFT BY (\d*.\d*)', mapping.transformation).group(1))
+    except ValueError:
+        return lambda df: df[mapping.source_field].copy()
+
+    return lambda df: df[mapping.source_field] + shift
diff --git a/llmint/core/eval.py b/llmint/core/eval.py
@@ -1,3 +1,6 @@
+from llmint.map.function import Map
+
+
 class pcolors:
     RIGHT = '\033[92m'
     WRONG = '\033[91m'
@@ -43,13 +46,11 @@ def accuracy(output: list, test_example: list):
         f1 = 0
     return precision, recall, f1
 
-def print_mappings(mappings: dict, include_reasoning=True):
-    for name, response in mappings.items():
-        mapping, reasoning = response
-        if include_reasoning:
-
-            print(pcolors.RIGHT + mapping + pcolors.ENDC + '\n',
-                  reasoning, flush=True)
-        else:
-            print(pcolors.RIGHT + mapping + pcolors.ENDC,
-                  flush=True)
+def print_mappings(mappings: list[Map], include_reasoning=True):
+    for mapping in mappings:
+            if include_reasoning:
+                print(pcolors.RIGHT + mapping.__dict__ + pcolors.ENDC + '\n',
+                    mapping.reasoning, flush=True)
+            else:
+                print(pcolors.RIGHT + mapping.__dict__ + pcolors.ENDC,
+                    flush=True)
diff --git a/llmint/map/function.py b/llmint/map/function.py
@@ -1,8 +1,18 @@
+from pydantic import BaseModel
+
 from llmint.core import model
 from llmint.map import prompt, parameter
 
+
+class Map(BaseModel):
+    source_field: str | None
+    target_field: str
+    transformation: str
+    reasoning: str | None
+
+
 def map(source_schema, target_schema):
-    mappings = model.call(
+    output = model.call(
         prompt=[
             {"role": "system", "content": prompt.system},
             {"role": "user", "content": prompt.user.format(
@@ -15,5 +25,12 @@ def map(source_schema, target_schema):
         temperature=parameter.temperature,
         seed=parameter.seed,
         max_model_call=1,  # only one model call
-    )["tool_outputs"][0] # take the first tool output
+    )["tool_outputs"]
+
+    # process the mappings
+    mappings = []
+    for mapping in output:
+        for _, mapping in mapping.items():
+            mappings.append(mapping)
+
     return mappings
diff --git a/llmint/map/parameter.py b/llmint/map/parameter.py
@@ -7,16 +7,18 @@
     "llmint.map.match",
     # field transformation
     "llmint.map.transform.field.add",
-    "llmint.map.transform.field.cast",
+    # "llmint.map.transform.field.cast",
     "llmint.map.transform.field.copy",
     "llmint.map.transform.field.default",
-    "llmint.map.transform.field.delete",
-    "llmint.map.transform.field.rename",
+    # "llmint.map.transform.field.delete",
+    # "llmint.map.transform.field.rename",
     "llmint.map.transform.field.missing",
     # value transformation
     "llmint.map.transform.value.apply",
     # "llmint.map.transform.value.gen",
-    "llmint.map.transform.value.link",
+    # "llmint.map.transform.value.link",
     "llmint.map.transform.value.scale",
     "llmint.map.transform.value.shift",
 ]
+
+reasoning = False
diff --git a/llmint/map/prompt.py b/llmint/map/prompt.py
@@ -69,3 +69,6 @@
 
 user = "Source Schema: ' + {source_schema} + " \
        "'\nTarget Schema: ' + {target_schema}"
+
+"""Reasoning prompt"""
+reasoning_prompt = "In-depth reasoning as to why you chose this function"
diff --git a/llmint/map/transform/field/add.py b/llmint/map/transform/field/add.py
@@ -1,31 +1,37 @@
+from libem.core.util import create_json_schema
+
+from llmint.map.function import Map
+from llmint.map.parameter import reasoning
+from llmint.map.prompt import reasoning_prompt
+
+
 name = "ADD"
+description = "Add an optional target field"
+properties = {
+    "target_field": (str, "Optional field in the target schema"),
+    "field_type": (str, "The type of the field to be added"),
+}
+if reasoning:
+    properties["reasoning"] = (str, reasoning_prompt)
+
 schema = {
     "type": "function",
     "function": {
         "name": name,
-        "description": "Add an optional target field",
+        "description": description,
         "parameters": {
             "type": "object",
-            "properties": {
-                "target_field": {
-                    "type": "string",
-                    "description": "Optional field in the target schema",
-                },
-                "field_type": {
-                    "type": "string",
-                    "description": "The type of the field to be added",
-                },
-                "reasoning": {
-                    "type": "string",
-                    "description": "In-depth reasoning as to why you chose this function",
-                },
-            },
-            "required": ["target_field", "field_type", "reasoning"],
-        },
+            "properties": create_json_schema(
+                **properties
+            )["properties"],
+            "required": list(properties.keys()),
+        }
     }
 }
 
 
-def func(target_field, field_type, reasoning):
-    return (f'{{from: None, to: {target_field}, '
-            f'transformation: ADD {target_field} TYPE {field_type}}}', reasoning)
+def func(target_field, field_type, reasoning=None):
+    return Map(source_field=None, 
+               target_field=target_field, 
+               transformation=f'ADD {target_field} TYPE {field_type}', 
+               reasoning=reasoning)
diff --git a/llmint/map/transform/field/cast.py b/llmint/map/transform/field/cast.py
@@ -1,3 +1,6 @@
+from llmint.map.function import Map
+
+
 name = "CAST"
 schema = {
     "type": "function",
@@ -35,5 +38,7 @@
 
 
 def func(source_field, target_field, source_type, target_type, reasoning):
-    return (f'{{from: {source_field}, to: {target_field}, '
-            f'transformation: CAST {source_field} FROM {source_type} TO {target_type}}}', reasoning)
+    return Map(source_field=source_field,
+               target_field=target_field,
+               transformation=f'CAST FROM {source_type} TO {target_type}',
+               reasoning=reasoning)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		from llmint.assemble.pandas.function import assemble, construct