-
Notifications
You must be signed in to change notification settings - Fork 547
Description
Hi everyone,
I am trying to use captum's TCAV for a pre-trained TinyLlama model (text generation model). I have followed the tutorial (https://captum.ai/tutorials/TCAV_NLP) but am facing the following error related to the dimension sizes:
Traceback (most recent call last):
File "/khush194/llama/tcav.py", line 114, in <module>
tcav.interpret(tokens_tensor, experimental_sets)
File "/khush194/llamaEnv/lib/python3.10/site-packages/captum/log/__init__.py", line 42, in wrapper
return func(*args, **kwargs)
File "/khush194/llamaEnv/lib/python3.10/site-packages/captum/concept/_core/tcav.py", line 662, in interpret
self.compute_cavs(experimental_sets, processes=processes)
File "/khush194/llamaEnv/lib/python3.10/site-packages/captum/concept/_core/tcav.py", line 546, in compute_cavs
train_cav(
File "/khush194/llamaEnv/lib/python3.10/site-packages/captum/concept/_core/tcav.py", line 170, in train_cav
classifier_stats_dict = classifier.train_and_eval(
File "/khush194/llamaEnv/lib/python3.10/site-packages/captum/concept/_utils/classifier.py", line 174, in train_and_eval
torch.cat(inputs), torch.cat(labels), test_split=test_split_ratio
RuntimeError: Sizes of tensors must match except in dimension 0. Expected size 84480 but got size 163328 for tensor number 16 in the list.
I am not sure why this is occurring. Each line of the concept CSVs is tokenized and padded to the same length, so the resulting token tensors should all have the same size.
Does anyone have any idea about what I might be doing wrong?
Below is my code:
class CSVTextDataset(Dataset):
    """Dataset exposing a single text column of a CSV file.

    Each item is a one-entry dict mapping the column name to that row's text.
    """

    def __init__(self, csv_file, text_col):
        """
        Args:
            csv_file (string): Path to the CSV file.
            text_col (string): Name of the column containing the text data.
        """
        # The file is decoded as windows-1252 rather than UTF-8.
        self.data = pd.read_csv(csv_file, encoding='windows-1252')
        self.text_col = text_col

    def __len__(self):
        """Return the number of rows read from the CSV file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{text_col: text}`` for the row labelled ``idx``."""
        text = self.data.loc[idx, self.text_col]
        sample = {self.text_col: text}
        return sample
# Text-generation pipeline; its tokenizer and model are reused further down.
pipe = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
def get_tensor_from_filename(filename, max_length=15):
    """Yield one fixed-length tensor of token ids per row of a concept CSV.

    Args:
        filename (string): Path to a CSV file with a ``text`` column.
        max_length (int): Number of token ids in every yielded tensor.
            Shorter texts are padded and longer texts are truncated.

    Yields:
        torch.Tensor: 1-D tensor holding ``max_length`` token ids.
    """
    ds = CSVTextDataset(csv_file=filename, text_col='text')
    print("-------------------------")
    for i in range(len(ds)):
        text = ds[i]["text"]
        # Tokenize the string and convert to indices.
        # padding='max_length' only lengthens short texts; without
        # truncation=True a longer text keeps all of its tokens, so the
        # tensors (and the layer activations computed from them) end up
        # with different sizes and cannot be concatenated.
        indexed_tokens = pipe.tokenizer.encode(
            text,
            padding='max_length',
            truncation=True,
            max_length=max_length,
        )
        # Convert the list of indices to a tensor
        tokens_tensor = torch.tensor(indexed_tokens)
        yield tokens_tensor
def print_concept_sample(concept_iter):
    """Consume ``concept_iter`` until it is exhausted or yields ``None``.

    Despite its name this function prints nothing; it only advances the
    iterator. An empty iterator is handled as a no-op instead of letting
    ``StopIteration`` escape to the caller.

    Args:
        concept_iter: Iterator over the samples of a concept.
    """
    for item in concept_iter:
        # A None item ends the walk early; later items are left unconsumed.
        if item is None:
            break
# Concept 1
dataset = CustomIterableDataset(get_tensor_from_filename, r"./Book1.csv")
concept_iter = dataset_to_dataloader(dataset, batch_size=1)
x = Concept(id=0, name="depression", data_iter=concept_iter)
print_concept_sample(iter(x.data_iter))

# Concept 2
dataset2 = CustomIterableDataset(get_tensor_from_filename, r"./Book2.csv")
concept_iter2 = dataset_to_dataloader(dataset2, batch_size=1)
y = Concept(id=1, name="happiness", data_iter=concept_iter2)
print_concept_sample(iter(y.data_iter))

experimental_sets = [[x, y]]
tcav = TCAV(pipe.model, layers=['model.layers.12.mlp.act_fn'])

# Named input_text so the builtin input() is not shadowed.
input_text = "I am feeling very sad and depressed. I have no motivation and am very lonely"

# Tokenize the string and convert to indices.
# truncation=True keeps the sequence at exactly max_length tokens;
# padding='max_length' alone does not shorten a longer sentence, and the
# layer activations must have the same size as those of the concept examples.
indexed_tokens = pipe.tokenizer.encode(
    input_text,
    padding='max_length',
    truncation=True,
    max_length=15,
)

# Convert the list of indices to a tensor. The extra list level adds a leading
# batch dimension, giving shape (1, 15): Captum treats dimension 0 of the
# inputs as the number of examples.
tokens_tensor = torch.tensor([indexed_tokens])

# interpret
# NOTE(review): interpret() also accepts a `target` argument, and pipe.model
# presumably returns a structured output rather than a plain tensor; a wrapper
# returning logits plus an explicit target may be needed - confirm.
tcav.interpret(tokens_tensor, experimental_sets)