Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions scenes/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
data/*.model
chainer_env/
Binary file added scenes/__pycache__/translator.cpython-37.pyc
Binary file not shown.
64 changes: 64 additions & 0 deletions scenes/code/CaptionDataLoader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#class to get the data in a batch way
#loading on memory option (preload_all_features) took 6m10.400s (user time = 2m41.546s) to load if it is true

import numpy as np

class CaptionDataLoader(object):
    """Mini-batch loader pairing MSCOCO captions with precomputed image features.

    Features are stored one .npz file per image (array under key 'arr_0');
    captions is a dict: caption_id -> {"image_id": ..., "token_ids": [...]}.
    With preload_all_features=True every feature is read into memory up front
    (the original author measured ~6m10s wall time for that load).
    """

    def __init__(self, captions, image_feature_path, preload_all_features=False,
                 filename_img_id=False):
        self.captions = captions
        # Path prefix placed before the image id, e.g.
        # ../data/MSCOCO/train2014_ResNet50_features/COCO_train2014_
        self.image_feature_path = image_feature_path
        # Bug fix: materialize the keys. In Python 3, dict.keys() returns a
        # view object that does NOT support indexing, but get_batch() below
        # does self.caption_ids[i] — the original code raised TypeError.
        self.caption_ids = list(captions.keys())
        self.random_indicies = np.random.permutation(len(self.captions))
        self.index_count = 0
        self.epoch = 1
        self.preload_all_features = preload_all_features
        self.filename_img_id = filename_img_id
        if self.preload_all_features:
            self.image_features = np.array(
                [np.load(self._feature_file(caption_id))['arr_0']
                 for caption_id in self.caption_ids])

    def _feature_file(self, caption_id):
        """Return the .npz path holding the feature for this caption's image."""
        image_id = self.captions[caption_id]["image_id"]
        if self.filename_img_id:
            # Feature files are named directly after the image id.
            return "%s/%s.npz" % (self.image_feature_path, image_id)
        # MSCOCO convention: the id is zero-padded to 12 digits after a prefix.
        return "%s%012d.npz" % (self.image_feature_path, image_id)

    def get_batch(self, batch_size):
        """Return (image_features, token_id_arrays) for one random mini-batch.

        image_features is a (batch, feature_dim...) ndarray; the second element
        is a list of int32 arrays of token ids (variable length per caption).
        """
        batch_data_indicies = self.random_indicies[self.index_count:self.index_count + batch_size]
        self.index_count += batch_size
        if self.index_count > len(self.captions):
            # Walked past the end of the epoch: reshuffle and restart counting.
            self.epoch += 1
            self.suffle_data()
            self.index_count = 0

        if self.preload_all_features:
            batch_image_features = self.image_features[batch_data_indicies]
        else:
            batch_image_features = np.array(
                [np.load(self._feature_file(self.caption_ids[i]))['arr_0']
                 for i in batch_data_indicies])

        batch_word_indices = [
            np.array(self.captions[self.caption_ids[i]]["token_ids"], dtype=np.int32)
            for i in batch_data_indicies]

        return batch_image_features, batch_word_indices

    def suffle_data(self):
        """Draw a fresh random visiting order (misspelled name kept for callers)."""
        self.random_indicies = np.random.permutation(len(self.captions))


if __name__ == '__main__':
    # Smoke test: load the preprocessed captions and fetch one mini-batch,
    # first loading features on demand, then with everything preloaded.
    import json

    with open("../data/MSCOCO/mscoco_caption_train2014_processed.json", 'r') as f:
        captions = json.load(f)

    for preload in (False, True):
        dataset = CaptionDataLoader(
            captions,
            image_feature_path="../data/MSCOCO/train2014_ResNet50_features/COCO_train2014_",
            preload_all_features=preload)
        batch_image_features, batch_word_indices = dataset.get_batch(10)
        print(batch_word_indices)
        print(batch_image_features.shape)
76 changes: 76 additions & 0 deletions scenes/code/CaptionDataLoader2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#class to get the data in a batch way
#loading on memory option (preload_all_features) took 6m10.400s (user time = 2m41.546s) to load if it is true
#refactored version of CaptionDataLoader.py

import numpy as np
import os
from image_loader import Image_loader
from ResNet50 import ResNet

class CaptionDataLoader(object):
    """Mini-batch provider over a preprocessed caption dataset.

    The dataset dict holds "captions" (each with "idx", "image_idx",
    "caption") and "images" (each with "file_path"). Precomputed image
    features live under image_feature_root, named like the image files but
    with the extension changed to .npz; raw images live under image_root.
    Preloading all features into memory is supported (slow first load).
    """

    def __init__(self, dataset, image_feature_root, image_root="",
                 preload_all_features=False, image_mean="imagenet",
                 holding_raw_captions=False):
        self.holding_raw_captions = holding_raw_captions
        self.image_loader = Image_loader(mean=image_mean)
        self.captions = dataset["captions"]
        self.num_captions = len(self.captions)
        self.images = dataset["images"]
        # Lookup table: caption idx -> index of its image record.
        self.caption2image = {caption["idx"]: caption["image_idx"]
                              for caption in dataset["captions"]}
        self.image_feature_root = image_feature_root + "/"
        self.image_root = image_root + "/"
        self.random_indicies = np.random.permutation(len(self.captions))
        self.index_count = 0
        self.epoch = 1
        self.preload_all_features = preload_all_features
        if self.preload_all_features:
            self.image_features = np.array(
                [np.load("%s/%s.npz" % (self.image_feature_root,
                                        os.path.splitext(image["file_path"])[0]))['arr_0']
                 for image in self.images])

    def get_batch(self, batch_size, raw_image=False):
        """Return (images, captions) for one random mini-batch.

        With raw_image=True the first element is a Batchx3x224x224 pixel
        array; otherwise it holds the precomputed feature vectors. Captions
        come back as raw text when holding_raw_captions is set, else as
        int32 token-id arrays.
        """
        chosen = self.random_indicies[self.index_count:self.index_count + batch_size]
        self.index_count += batch_size
        if self.index_count > len(self.captions):
            # Crossed an epoch boundary: bump the counter and reshuffle.
            self.epoch += 1
            self.suffle_data()
            self.index_count = 0

        if raw_image:
            chunks = [self.image_loader.load(
                          self.image_root + self.images[self.caption2image[i]]["file_path"],
                          expand_batch_dim=False)
                      for i in chosen]
            batch_images = np.array(chunks)
        elif self.preload_all_features:
            batch_images = self.image_features[[self.caption2image[i] for i in chosen]]
        else:
            batch_images = np.array(
                [np.load("%s/%s.npz" % (
                     self.image_feature_root,
                     os.path.splitext(self.images[self.caption2image[i]]["file_path"])[0]
                 ))['arr_0'] for i in chosen])

        if self.holding_raw_captions:
            batch_word_indices = [self.captions[i]["caption"] for i in chosen]
        else:
            batch_word_indices = [np.array(self.captions[i]["caption"], dtype=np.int32)
                                  for i in chosen]

        return batch_images, batch_word_indices

    def suffle_data(self):
        """Redraw the random visiting order (misspelled name kept for callers)."""
        self.random_indicies = np.random.permutation(len(self.captions))


if __name__ == '__main__':
    # Smoke test: fetch one batch of raw images, one of on-demand features,
    # and one with all features preloaded.
    import json

    with open("../data/MSCOCO/mscoco_train2014_all_preprocessed.json", 'r') as f:
        captions = json.load(f)

    dataset = CaptionDataLoader(
        captions,
        image_feature_root="../data/MSCOCO/MSCOCO_ResNet50_features/",
        image_root="../data/MSCOCO/MSCOCO_raw_images/")
    batch_images, batch_word_indices = dataset.get_batch(10, raw_image=True)
    print(batch_word_indices)
    print(batch_images)

    batch_image_features, batch_word_indices = dataset.get_batch(10)
    print(batch_word_indices)
    print(batch_image_features.shape)

    dataset = CaptionDataLoader(
        captions,
        image_feature_root="../data/MSCOCO/MSCOCO_ResNet50_features",
        preload_all_features=True)
    batch_image_features, batch_word_indices = dataset.get_batch(10)
    print(batch_word_indices)
    print(batch_image_features.shape)
129 changes: 129 additions & 0 deletions scenes/code/CaptionEvaluater.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

'''
evaluate captions by Blue, Rouge, and Cider

This can be used for any language, assuming that the inputs are already segmented (i.e. tokenized).
The ground truth should be lowercased before being passed in.
'''

import sys
sys.path.append('../coco-caption/')
sys.path.append('../coco-caption/pycocoevalcap/')
sys.path.append('./coco-caption/')
sys.path.append('./coco-caption/pycocoevalcap/')
from bleu.bleu import Bleu
from rouge.rouge import Rouge
from cider.cider import Cider
import string
import re

class CaptionEvaluater(object):
    """Score predicted captions against references with BLEU, ROUGE-L and CIDEr.

    Language-agnostic as long as the captions are already whitespace-tokenized.
    Ground-truth captions should be lowercased before being passed in.
    """

    def __init__(self):
        self.blue_scorer = Bleu(4)
        self.rouge_scorer = Rouge()
        self.cider_scorer = Cider()
        self.truth = None
        # ASCII punctuation plus common Japanese punctuation marks.
        remove = string.punctuation + "、。，."
        self.remove_pattern = r"[{}]".format(remove)  # create the pattern

    def remove_punctuation(self, line):
        """Drop <unk> tokens and punctuation from a single caption string."""
        # "。" (U+3002) is stripped explicitly; the regex handles the rest.
        # (The original duplicated each replace with u"..."/'\u3002' forms,
        # which are identical strings in Python 3.)
        line = line.replace("<unk>", "")
        line = line.replace("\u3002", "")
        return re.sub(self.remove_pattern, "", line)

    def trnasform_utf8(self, line):
        """Identity hook kept for API compatibility (old UTF-8 re-encoding step)."""
        # return u' '.join(line).encode('utf-8').strip()
        return line

    def set_ground_truth(self, ground_truth):
        '''
        ground_truth should be a python dictionary whose shape is;
        {"image_identifier": ["a caption", "a similar caption", ...], ...}
        "image_identifier" can be either string or number.
        '''
        for img in ground_truth:
            # Bug fix: wrap map() in list(). In Python 3 map() returns a
            # one-shot iterator; the scorers iterate the captions (possibly
            # more than once), so an iterator silently yields nothing the
            # second time around.
            ground_truth[img] = list(map(self.remove_punctuation, ground_truth[img]))
        self.truth = ground_truth

    def evaluate(self, predicetd_captions):
        '''
        predicetd_captions should be a python dictionary whose shape is;
        {"image_identifier": ["the predicted caption"], ...}
        "image_identifier" needs to match the one used in the ground truth.
        Make sure there is exactly one caption per image, even though the
        value is a python list.
        '''
        for img in predicetd_captions:
            # Same Python-3 map() fix as in set_ground_truth().
            predicetd_captions[img] = list(map(self.remove_punctuation,
                                               predicetd_captions[img]))

        results = {}
        # NOTE: keys are bleu-0 .. bleu-3 (0-based), matching the original output.
        for i, score in enumerate(self.get_bleu(predicetd_captions)[0]):
            results["bleu-%d" % i] = score
        results["rouge"] = self.get_rouge(predicetd_captions)[0]
        results["cider"] = self.get_cider(predicetd_captions)[0]

        return results

    def get_bleu(self, predicetd_captions):
        """Return (corpus_scores, per-image_scores) from the BLEU scorer."""
        score, scores = self.blue_scorer.compute_score(self.truth, predicetd_captions)
        # score is a python list [bleu-1, bleu-2, bleu-3, bleu-4]
        return score, scores

    def get_rouge(self, predicetd_captions):
        """Return (corpus_score, per-image_scores) from the ROUGE-L scorer."""
        score, scores = self.rouge_scorer.compute_score(self.truth, predicetd_captions)
        return score, scores

    def get_cider(self, predicetd_captions):
        """Return (corpus_score, per-image_scores) from the CIDEr scorer."""
        score, scores = self.cider_scorer.compute_score(self.truth, predicetd_captions)
        return score, scores

if __name__ == '__main__':
    # Test: pass in captions already split on spaces (tokenized).
    skateboard_refs = ['the skateboarder is putting on a show using the picnic table as his stage',
                       'a skateboarder pulling tricks on top of a picnic table',
                       'a man riding on a skateboard on top of a table',
                       'a skate boarder doing a trick on a picnic table',
                       'a person is riding a skateboard on a picnic table with a crowd watching']
    soup_refs = ['a bowl of soup that has some carrots shrimp and noodles in it',
                 'the healthy food is in the bowl and ready to eat',
                 'soup has carrots and shrimp in it as it sits next to chopsticks',
                 'a tasty bowl of ramen is served for someone to enjoy',
                 'bowl of asian noodle soup with shrimp and carrots']

    ground_truth = {}
    ground_truth['262148'] = ['オレンジ色 の シャツ を 着た 人 が います',
                              'オレンジ色 の Tシャツ を 着ている 人 が 立って います',
                              '人 が オレンジ色 の シャツ を 着て 立って います',
                              ]
    ground_truth[262148] = skateboard_refs
    ground_truth[393225] = soup_refs
    ground_truth[1] = skateboard_refs
    ground_truth[2] = soup_refs

    # Exactly one predicted caption per image is allowed.
    predicted = {}
    predicted['262148'] = ['人 が オレンジ色 の シャツ を 着て 立って <unk> います。']
    predicted[262148] = ['A man riding a skateboard down a ramp。']
    predicted[393225] = ['A bowl of soup with carrots and a spoon.']
    predicted[1] = ['a man riding a skateboard down a ramp。']
    predicted[2] = ['a bowl of soup with carrots and a spoon、<unk>']
    # Keys may be numbers or strings — just make sure ground truth and
    # predictions use matching identifiers.

    evaluater = CaptionEvaluater()
    evaluater.set_ground_truth(ground_truth)
    print(evaluater.evaluate(predicted))
    # https://github.com/tylin/coco-caption/issues/5
    # Yes, CIDEr can have values till 10 (technically).
Loading