
Commit c3b24de

Merge pull request #53 from smarter/fix-ground-truth
ekfac: fix compute_ekfac_ground_truth, add minimal CI
2 parents (8232b77 + 7e350e6), commit c3b24de

28 files changed: +1428, -1639 lines

.github/workflows/build.yml

Lines changed: 39 additions & 0 deletions

@@ -0,0 +1,39 @@
+name: build
+
+on:
+  push:
+    branches:
+      - ekfac
+  pull_request:
+    branches:
+      - ekfac
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[dev,faiss]"
+      # TODO: Proper test infrastructure for tests/ekfac_tests
+      # - name: Run tests
+      #   run: pytest
+      # TODO: run pyright on whole codebase
+      - name: Type Checking bergson/hessians
+        uses: jakebailey/pyright-action@v1
+        with:
+          version: 1.1.406
+          working-directory: bergson/hessians
+      - name: Type Checking tests/ekfac_tests
+        uses: jakebailey/pyright-action@v1
+        with:
+          version: 1.1.406
+          working-directory: tests/ekfac_tests
+      - name: build
+        run: pip wheel --no-deps -w dist .
+        env:
+          HF_HUB_DOWNLOAD_TIMEOUT: 100

bergson/collection.py

Lines changed: 3 additions & 1 deletion

@@ -72,7 +72,9 @@ def callback(name: str, g: torch.Tensor):
     grad_sizes = {name: math.prod(s) for name, s in collector.shapes().items()}
 
     # Allocate structured space ahead of time for the gradients
-    grad_buffer = create_index(cfg.run_path, num_grads=len(data), grad_sizes=grad_sizes, dtype=np.float16)
+    grad_buffer = create_index(
+        cfg.run_path, num_grads=len(data), grad_sizes=grad_sizes, dtype=np.float16
+    )
 
     per_doc_losses = torch.full(
         (len(data),),
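
For context on the buffer allocated here: create_index (see the bergson/data.py diff below) returns a structured np.memmap with one float16 field per module. A minimal, self-contained sketch of that layout follows; the module names, sizes, row count, and the /tmp path are made up for illustration and are not bergson's actual implementation.

import os

import numpy as np

# Hypothetical sizes; in the commit these come from collector.shapes() as above.
grad_sizes = {"q_proj": 16, "v_proj": 16}
num_grads = 8  # one row per document

# One named float16 field per module, mirroring a structured on-disk layout.
dtype = np.dtype([(name, np.float16, (size,)) for name, size in grad_sizes.items()])

os.makedirs("/tmp/bergson_demo", exist_ok=True)
grad_buffer = np.memmap(
    os.path.join("/tmp/bergson_demo", "gradients.bin"),
    dtype=dtype,
    mode="w+",
    shape=(num_grads,),
)
grad_buffer["q_proj"][0] = np.ones(16, dtype=np.float16)  # write one row's gradient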

bergson/data.py

Lines changed: 25 additions & 8 deletions

@@ -16,6 +16,8 @@
 
 from .utils import assert_type
 
+Precision = Literal["bf16", "fp16", "fp32", "int4", "int8"]
+
 
 @dataclass
 class DataConfig:
@@ -48,7 +50,7 @@ class IndexConfig:
     fsdp: bool = False
     """Whether to use Fully Sharded Data Parallel (FSDP) for collecing gradients."""
 
-    precision: Literal["bf16", "fp16", "fp32", "int4", "int8"] = "bf16"
+    precision: Precision = "bf16"
     """Precision to use for the model parameters."""
 
     projection_dim: int = 16
@@ -99,7 +101,9 @@ def ceildiv(a: int, b: int) -> int:
     return -(-a // b)  # Equivalent to math.ceil(a / b) but faster for integers
 
 
-def allocate_batches(doc_lengths: list[int], N: int, world_size: Optional[int] = None) -> list[list[int]]:
+def allocate_batches(
+    doc_lengths: list[int], N: int, world_size: Optional[int] = None
+) -> list[list[int]]:
     """
     Allocate documents into batches that are then distributed evenly across
     a fixed number of workers.
@@ -183,7 +187,9 @@ def allocate_batches(doc_lengths: list[int], N: int, world_size: Optional[int] =
     while len(batches) < world_size:
         big = batches.pop(0)  # take the current largest
         if len(big) == 1:  # cannot split a singleton
-            raise RuntimeError("Not enough documents to give each worker at least one batch.")
+            raise RuntimeError(
+                "Not enough documents to give each worker at least one batch."
+            )
         batches.append([big.pop()])  # move one doc into new batch
         batches.append(big)  # put the remainder back
         # preserve cost constraint automatically
@@ -205,7 +211,9 @@ def allocate_batches(doc_lengths: list[int], N: int, world_size: Optional[int] =
         i += 1
 
     assert len(batches) == target_batches
-    assert all(max(doc_lengths[i] for i in batch) * len(batch) <= N for batch in batches)
+    assert all(
+        max(doc_lengths[i] for i in batch) * len(batch) <= N for batch in batches
+    )
 
     # ---------------------------------------------------------------------
     # 4) Round-robin assignment to workers
@@ -219,7 +227,9 @@ def allocate_batches(doc_lengths: list[int], N: int, world_size: Optional[int] =
     return allocation[rank]
 
 
-def create_index(root: str, num_grads: int, grad_sizes: dict[str, int], dtype: DTypeLike) -> np.memmap:
+def create_index(
+    root: str, num_grads: int, grad_sizes: dict[str, int], dtype: DTypeLike
+) -> np.memmap:
     """Create a memory-mapped file for storing structured gradients
     and persist metadata."""
     grad_path = os.path.join(root, "gradients.bin")
@@ -310,7 +320,9 @@ def load_shard(dir: str) -> Dataset:
     if concatenate_gradients:
         unstructured_data = structured_to_unstructured(mmap)
         flat = pa.array(unstructured_data.reshape(-1))
-        col_arrow = pa.FixedSizeListArray.from_arrays(flat, unstructured_data.shape[1])
+        col_arrow = pa.FixedSizeListArray.from_arrays(
+            flat, unstructured_data.shape[1]
+        )
 
         ds = ds.add_column("gradients", col_arrow, new_fingerprint="gradients")
     # Add a column for each module's gradient vectors
@@ -374,7 +386,9 @@ def tokenize(batch: dict, *, args: DataConfig, tokenizer):
                 {"role": "user", "content": assert_type(str, prompt)},
                 {"role": "assistant", "content": assert_type(str, resp)},
             ]
-            for prompt, resp in zip(batch[args.prompt_column], batch[args.completion_column])
+            for prompt, resp in zip(
+                batch[args.prompt_column], batch[args.completion_column]
+            )
         ]
     elif args.conversation_column:
         # We're dealing with a conversation dataset
@@ -421,4 +435,7 @@ def tokenize(batch: dict, *, args: DataConfig, tokenizer):
 def unflatten(x: torch.Tensor, shapes: dict[str, Sequence[int]], dim: int = -1):
     """Unflatten a tensor `x` into a dictionary of tensors with specified shapes."""
     numels = [math.prod(shape) for shape in shapes.values()]
-    return {name: x.unflatten(dim, shape) for (name, shape), x in zip(shapes.items(), x.split(numels, dim=dim))}
+    return {
+        name: x.unflatten(dim, shape)
+        for (name, shape), x in zip(shapes.items(), x.split(numels, dim=dim))
+    }
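
A note on the last hunk: unflatten reverses the flattening of per-module gradients into a single vector. The sketch below copies the function body from the hunk so it is self-contained; the shapes dict and the input tensor are invented purely for illustration.

import math
from typing import Sequence

import torch


def unflatten(x: torch.Tensor, shapes: dict[str, Sequence[int]], dim: int = -1):
    """Unflatten a tensor `x` into a dictionary of tensors with specified shapes."""
    numels = [math.prod(shape) for shape in shapes.values()]
    return {
        name: x.unflatten(dim, shape)
        for (name, shape), x in zip(shapes.items(), x.split(numels, dim=dim))
    }


shapes = {"q_proj": (4, 3), "k_proj": (2, 3)}             # hypothetical module shapes
flat = torch.arange(4 * 3 + 2 * 3, dtype=torch.float32)  # one flattened gradient row
parts = unflatten(flat, shapes)
assert parts["q_proj"].shape == (4, 3) and parts["k_proj"].shape == (2, 3)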

bergson/distributed.py

Lines changed: 24 additions & 7 deletions

@@ -64,22 +64,30 @@ def setup_data_pipeline(cfg: IndexConfig) -> Dataset | IterableDataset:
         ds = load_dataset(data_str, split="train")
 
         if isinstance(ds, DatasetDict) or isinstance(ds, IterableDatasetDict):
-            raise NotImplementedError("DatasetDicts and IterableDatasetDicts are not supported.")
+            raise NotImplementedError(
+                "DatasetDicts and IterableDatasetDicts are not supported."
+            )
     except ValueError as e:
         # Automatically use load_from_disk if appropriate
         if "load_from_disk" in str(e):
             ds = Dataset.load_from_disk(data_str, keep_in_memory=False)
         else:
             raise e
 
-    tokenizer = AutoTokenizer.from_pretrained(cfg.model, model_max_length=cfg.token_batch_size)
+    tokenizer = AutoTokenizer.from_pretrained(
+        cfg.model, model_max_length=cfg.token_batch_size
+    )
 
-    ds = ds.map(tokenize, batched=True, fn_kwargs=dict(args=cfg.data, tokenizer=tokenizer))
+    ds = ds.map(
+        tokenize, batched=True, fn_kwargs=dict(args=cfg.data, tokenizer=tokenizer)
+    )
 
     return ds
 
 
-def setup_model_and_peft(cfg: IndexConfig, rank: int, dtype: torch.dtype) -> tuple[AutoModelForCausalLM, set | None]:
+def setup_model_and_peft(
+    cfg: IndexConfig, rank: int, dtype: torch.dtype
+) -> tuple[AutoModelForCausalLM, set | None]:
     """Handle model loading, quantization, FSDP, and PEFT detection"""
 
     torch.manual_seed(42)
@@ -141,7 +149,9 @@ def setup_model_and_peft(cfg: IndexConfig, rank: int, dtype: torch.dtype) -> tup
                 model.get_submodule(processed_name)
                 target_modules.add(processed_name)
             except AttributeError:
-                print(f"Adapter parameter '{processed_name}' not found in the model.")
+                print(
+                    f"Adapter parameter '{processed_name}' not found in the model."
+                )
 
     # Configure gradients
     model.requires_grad_(False)
@@ -223,7 +233,11 @@ def worker_wrapper(
         case "fp32":
             dtype = torch.float32
         case "int4" | "int8":
-            dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+            dtype = (
+                torch.bfloat16
+                if torch.cuda.is_bf16_supported()
+                else torch.float16
+            )
         case other:
             raise ValueError(f"Unsupported precision: {other}")
 
@@ -305,7 +319,10 @@ def distributed_computing(
     ctx = start_processes(
         "build",
         worker_wrapper,
-        args={i: (i, world_size, cfg, ds, worker_fn, setup_model, setup_processor) for i in range(world_size)},
+        args={
+            i: (i, world_size, cfg, ds, worker_fn, setup_model, setup_processor)
+            for i in range(world_size)
+        },
         envs={
             i: {
                 "LOCAL_RANK": str(i),

bergson/gradients.py

Lines changed: 18 additions & 8 deletions

@@ -162,7 +162,9 @@ def to_adafactor(self) -> AdafactorNormalizer:
         and the factored second moments.
         """
         # We assume avg_sq is a square matrix of shape [O, I]
-        assert self.avg_sq.ndim == 2, f"Expected 2D tensor for avg_sq, got {self.avg_sq.ndim}D"
+        assert (
+            self.avg_sq.ndim == 2
+        ), f"Expected 2D tensor for avg_sq, got {self.avg_sq.ndim}D"
 
         # Compute row and column means
         return AdafactorNormalizer(
@@ -213,9 +215,6 @@ def save(self, path: str):
             json.dump(cfg, f, indent=2)
 
 
-
-
-
 @dataclass
 class GradientCollector(ContextDecorator):
     """
@@ -346,7 +345,12 @@ def _save_input(self, module: nn.Module, inp: tuple, _):
         if p is not None and not isinstance(norm, AdamNormalizer):
             i = module.in_features
 
-            x = x @ self.projection(name=name, m=p, n=i, side="right", dtype=x.dtype, device=x.device).T
+            x = (
+                x
+                @ self.projection(
+                    name=name, m=p, n=i, side="right", dtype=x.dtype, device=x.device
+                ).T
+            )
 
         module._inputs = x
 
@@ -387,14 +391,20 @@ def _process_grad(self, module: nn.Module, _, grad_out):
 
             # Project the gradients to the lower-dimensional space
            if p is not None:
-                A = self.projection(name=name, m=p, n=o, side="left", dtype=G.dtype, device=G.device)
-                B = self.projection(name=name, m=p, n=i, side="right", dtype=G.dtype, device=G.device)
+                A = self.projection(
+                    name=name, m=p, n=o, side="left", dtype=G.dtype, device=G.device
+                )
+                B = self.projection(
+                    name=name, m=p, n=i, side="right", dtype=G.dtype, device=G.device
+                )
                 P = A @ P @ B.T  # [N, p, q]
 
         # Both Adafactor and no normalizer, we can project G first
         else:
             if p is not None:
-                A = self.projection(name=name, m=p, n=o, side="left", dtype=G.dtype, device=G.device)
+                A = self.projection(
+                    name=name, m=p, n=o, side="left", dtype=G.dtype, device=G.device
+                )
                 G = G @ A.T  # [N, S, p]
 
             P = G.mT @ I  # [N, O/p, S] @ [N, S, I/q] → [N, O/p, I/q]
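
The projection hunks above are easier to follow with shapes written out: the per-example gradient of a Linear layer is an outer product of output gradients and saved inputs, and A/B compress it from [O, I] to [p, q]. Below is a shape-only sketch with random Gaussian matrices standing in for self.projection; all sizes are made up.

import torch

N, S = 2, 5            # batch size, sequence length
out_f, in_f = 8, 6     # out_features (O) and in_features (I) of the Linear layer
p = q = 3              # projection dimensions

G = torch.randn(N, S, out_f)  # gradients w.r.t. the layer's outputs
X = torch.randn(N, S, in_f)   # saved layer inputs

P = G.mT @ X                  # [N, out_f, S] @ [N, S, in_f] -> [N, out_f, in_f]

A = torch.randn(p, out_f)     # stands in for projection(..., side="left")
B = torch.randn(q, in_f)      # stands in for projection(..., side="right")

P_small = A @ P @ B.T         # [N, p, q], as in `P = A @ P @ B.T` above
assert P_small.shape == (N, p, q)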

bergson/hessians/attribute.py

Lines changed: 25 additions & 8 deletions

@@ -13,14 +13,20 @@
 # ## 1. Load index for query and train data
 
 parser = argparse.ArgumentParser(description="Process normalization flag.")
-parser.add_argument("--normalize", action="store_true", help="Gradients will be unit normalized.")
+parser.add_argument(
+    "--normalize", action="store_true", help="Gradients will be unit normalized."
+)
 args = parser.parse_args()
 
 device = "cuda:1"
 
 # %%
-base_path = "/mnt/ssd-1/gpaulo/emergent-misalignment/emergent-misalignment-eleuther/data/"
-index_dataset = load_dataset("json", data_files=f"{base_path}merged-medical-reformatted.jsonl")["train"]
+base_path = (
+    "/mnt/ssd-1/gpaulo/emergent-misalignment/emergent-misalignment-eleuther/data/"
+)
+index_dataset = load_dataset(
+    "json", data_files=f"{base_path}merged-medical-reformatted.jsonl"
+)["train"]
 index_path = "/mnt/ssd-1/gpaulo/emergent-misalignment/qwen14_merged_medical_proj16/merged_medical_no_normalizer"
 queries_path = "/mnt/ssd-1/louis/emergent_misalignment/test_query_ekfac"
 
@@ -37,17 +43,25 @@
 normalize = args.normalize
 
 attribution_dict = {}
-output_path = "/mnt/ssd-1/louis/emergent_misalignment/test_query_ekfac_attribution_no_normalizer"
+output_path = (
+    "/mnt/ssd-1/louis/emergent_misalignment/test_query_ekfac_attribution_no_normalizer"
+)
 if normalize:
     output_path += "_unit_norm"
 os.makedirs(output_path, exist_ok=True)
 
 for name in tqdm(list(names)):
     index_tensor = torch.from_numpy(index[name]).to(device=device, dtype=torch.float32)
-    queries_tensor = torch.from_numpy(queries[name]).to(device=device, dtype=torch.float32)
+    queries_tensor = torch.from_numpy(queries[name]).to(
+        device=device, dtype=torch.float32
+    )
     if normalize:
-        index_tensor = index_tensor / (torch.norm(index_tensor, dim=1, keepdim=True) + 1e-10)
-        queries_tensor = queries_tensor / (torch.norm(queries_tensor, dim=1, keepdim=True) + 1e-10)
+        index_tensor = index_tensor / (
+            torch.norm(index_tensor, dim=1, keepdim=True) + 1e-10
+        )
+        queries_tensor = queries_tensor / (
+            torch.norm(queries_tensor, dim=1, keepdim=True) + 1e-10
+        )
     # Compute result on GPU
     result_tensor = index_tensor @ queries_tensor.T
 
@@ -56,7 +70,10 @@
 
     # Create memory-mapped file with .bin extension
     mmap_file = np.memmap(
-        os.path.join(output_path, f"{name}_attribution.npy"), dtype=np.float32, mode="w+", shape=result_shape
+        os.path.join(output_path, f"{name}_attribution.npy"),
+        dtype=np.float32,
+        mode="w+",
+        shape=result_shape,
     )
 
     # Copy GPU result directly to memmap
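
As a side note on the --normalize flag handled above: dividing both gradient matrices by their row norms turns the dot-product attribution scores into cosine similarities. A small illustration with random stand-in tensors (the shapes and the CPU device are arbitrary here, not the script's real data):

import torch

index_tensor = torch.randn(4, 16)    # stand-in for one module's training gradients
queries_tensor = torch.randn(3, 16)  # stand-in for the query gradients

raw_scores = index_tensor @ queries_tensor.T  # [4, 3] dot products

eps = 1e-10
index_unit = index_tensor / (index_tensor.norm(dim=1, keepdim=True) + eps)
queries_unit = queries_tensor / (queries_tensor.norm(dim=1, keepdim=True) + eps)
cosine_scores = index_unit @ queries_unit.T   # [4, 3] cosine similarities in [-1, 1]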
