|
| 1 | +import asyncio |
| 2 | +import time |
| 3 | +from pathlib import Path |
| 4 | + |
| 5 | +from codeboxapi import CodeBox |
| 6 | + |
| 7 | + |
async def train_model(codebox: CodeBox, data_split: int) -> dict:
    """Train a linear regression model inside ``codebox`` and report metrics.

    Uploads the advertising dataset to the sandbox, installs pandas and
    scikit-learn there, then executes a training script whose train/test
    split is seeded with ``data_split``.

    Args:
        codebox: Sandbox that receives the dataset and executes the script.
        data_split: Integer interpolated into the script as the
            ``random_state`` of ``train_test_split`` and echoed in its output.

    Returns:
        A dict with the keys ``"split"`` (the ``data_split`` argument),
        ``"output"`` (``result.text`` of the execution) and ``"errors"``
        (``result.errors`` of the execution).

    Raises:
        FileNotFoundError: If the dataset file is not present locally.
    """
    # Relative path: resolved against the current working directory.
    dataset = Path("examples/assets/advertising.csv")

    # Raise explicitly instead of using ``assert``: assertions are removed
    # under ``python -O``, which would silently drop this check.
    if not dataset.exists():
        raise FileNotFoundError(f"Dataset file does not exist: {dataset}")

    # Upload dataset under its bare file name so the script can open it
    # as 'advertising.csv'.
    await codebox.aupload(dataset.name, dataset.read_bytes())

    # Install required packages
    await codebox.ainstall("pandas")
    await codebox.ainstall("scikit-learn")

    # Training script. Single braces are filled in here with ``data_split``;
    # doubled braces survive as single braces so the f-strings are evaluated
    # by the script itself when it runs.
    code = f"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load and prepare data
data = pd.read_csv('advertising.csv')
X = data[['TV', 'Radio', 'Newspaper']]
y = data['Sales']

# Split with different random states for different data subsets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state={data_split}
)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Split {data_split}:")
print(f"MSE: {{mse:.4f}}")
print(f"R2: {{r2:.4f}}")
print(f"Coefficients: {{model.coef_.tolist()}}")
"""
    result = await codebox.aexec(code)
    return {"split": data_split, "output": result.text, "errors": result.errors}
| 54 | + |
| 55 | + |
async def main() -> None:
    """Train on several data splits concurrently and print each outcome.

    Creates one ``CodeBox`` per split, runs ``train_model`` on all of them
    concurrently, prints the elapsed wall-clock time, and then prints either
    the captured output or the error for every split.
    """
    # One sandbox per split, created with api_key="docker".
    num_parallel = 4
    codeboxes = [CodeBox(api_key="docker") for _ in range(num_parallel)]

    # One task per sandbox; the enumerate index doubles as the split seed.
    tasks = [
        asyncio.create_task(train_model(codebox, split))
        for split, codebox in enumerate(codeboxes)
    ]

    # Execute and time the parallel processing.
    start_time = time.perf_counter()
    # return_exceptions=True keeps one failing split from discarding the
    # results of the others: exceptions come back as items in the result list
    # instead of propagating out of gather().
    results = await asyncio.gather(*tasks, return_exceptions=True)
    end_time = time.perf_counter()

    # Print results
    print(f"\nParallel execution completed in {end_time - start_time:.2f} seconds\n")
    # gather() returns results in the order the awaitables were passed, so
    # the position in ``results`` is the split number.
    for split, result in enumerate(results):
        if isinstance(result, BaseException):
            # train_model raised before it could return its result dict.
            print(f"Error in split {split}:", repr(result))
        elif result["errors"]:
            print(f"Error in split {result['split']}:", result["errors"])
        else:
            print(f"Results for {result['split']}:")
            print(result["output"])
            print("-" * 50)
| 81 | + |
| 82 | + |
# Run the example only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())
0 commit comments