Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions scenes/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
data/*.model
chainer_env/
Binary file added scenes/__pycache__/translator.cpython-37.pyc
Binary file not shown.
64 changes: 64 additions & 0 deletions scenes/code/CaptionDataLoader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#class to get the data in a batch way
#loading on memory option (preload_all_features) took 6m10.400s (user time = 2m41.546s) to load if it is true

import numpy as np

class CaptionDataLoader(object):
    """Mini-batch loader pairing MSCOCO captions with precomputed image features.

    Features are stored one .npz file per image (array under key 'arr_0');
    captions is a dict: caption_id -> {"image_id": ..., "token_ids": [...]}.
    With preload_all_features=True every feature is read into memory up front
    (the original author measured ~6m10s wall time for that load).
    """

    def __init__(self, captions, image_feature_path, preload_all_features=False,
                 filename_img_id=False):
        self.captions = captions
        # Path prefix placed before the image id, e.g.
        # ../data/MSCOCO/train2014_ResNet50_features/COCO_train2014_
        self.image_feature_path = image_feature_path
        # Bug fix: materialize the keys. In Python 3, dict.keys() returns a
        # view object that does NOT support indexing, but get_batch() below
        # does self.caption_ids[i] — the original code raised TypeError.
        self.caption_ids = list(captions.keys())
        self.random_indicies = np.random.permutation(len(self.captions))
        self.index_count = 0
        self.epoch = 1
        self.preload_all_features = preload_all_features
        self.filename_img_id = filename_img_id
        if self.preload_all_features:
            self.image_features = np.array(
                [np.load(self._feature_file(caption_id))['arr_0']
                 for caption_id in self.caption_ids])

    def _feature_file(self, caption_id):
        """Return the .npz path holding the feature for this caption's image."""
        image_id = self.captions[caption_id]["image_id"]
        if self.filename_img_id:
            # Feature files are named directly after the image id.
            return "%s/%s.npz" % (self.image_feature_path, image_id)
        # MSCOCO convention: the id is zero-padded to 12 digits after a prefix.
        return "%s%012d.npz" % (self.image_feature_path, image_id)

    def get_batch(self, batch_size):
        """Return (image_features, token_id_arrays) for one random mini-batch.

        image_features is a (batch, feature_dim...) ndarray; the second element
        is a list of int32 arrays of token ids (variable length per caption).
        """
        batch_data_indicies = self.random_indicies[self.index_count:self.index_count + batch_size]
        self.index_count += batch_size
        if self.index_count > len(self.captions):
            # Walked past the end of the epoch: reshuffle and restart counting.
            self.epoch += 1
            self.suffle_data()
            self.index_count = 0

        if self.preload_all_features:
            batch_image_features = self.image_features[batch_data_indicies]
        else:
            batch_image_features = np.array(
                [np.load(self._feature_file(self.caption_ids[i]))['arr_0']
                 for i in batch_data_indicies])

        batch_word_indices = [
            np.array(self.captions[self.caption_ids[i]]["token_ids"], dtype=np.int32)
            for i in batch_data_indicies]

        return batch_image_features, batch_word_indices

    def suffle_data(self):
        """Draw a fresh random visiting order (misspelled name kept for callers)."""
        self.random_indicies = np.random.permutation(len(self.captions))


if __name__ == '__main__':
    # Smoke test: load the preprocessed captions and fetch one mini-batch,
    # first loading features on demand, then with everything preloaded.
    import json

    with open("../data/MSCOCO/mscoco_caption_train2014_processed.json", 'r') as f:
        captions = json.load(f)

    for preload in (False, True):
        dataset = CaptionDataLoader(
            captions,
            image_feature_path="../data/MSCOCO/train2014_ResNet50_features/COCO_train2014_",
            preload_all_features=preload)
        batch_image_features, batch_word_indices = dataset.get_batch(10)
        print(batch_word_indices)
        print(batch_image_features.shape)
76 changes: 76 additions & 0 deletions scenes/code/CaptionDataLoader2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#class to get the data in a batch way
#loading on memory option (preload_all_features) took 6m10.400s (user time = 2m41.546s) to load if it is true
#refactored version of CaptionDataLoader.py

import numpy as np
import os
from image_loader import Image_loader
from ResNet50 import ResNet

class CaptionDataLoader(object):
    """Mini-batch provider over a preprocessed caption dataset.

    The dataset dict holds "captions" (each with "idx", "image_idx",
    "caption") and "images" (each with "file_path"). Precomputed image
    features live under image_feature_root, named like the image files but
    with the extension changed to .npz; raw images live under image_root.
    Preloading all features into memory is supported (slow first load).
    """

    def __init__(self, dataset, image_feature_root, image_root="",
                 preload_all_features=False, image_mean="imagenet",
                 holding_raw_captions=False):
        self.holding_raw_captions = holding_raw_captions
        self.image_loader = Image_loader(mean=image_mean)
        self.captions = dataset["captions"]
        self.num_captions = len(self.captions)
        self.images = dataset["images"]
        # Lookup table: caption idx -> index of its image record.
        self.caption2image = {caption["idx"]: caption["image_idx"]
                              for caption in dataset["captions"]}
        self.image_feature_root = image_feature_root + "/"
        self.image_root = image_root + "/"
        self.random_indicies = np.random.permutation(len(self.captions))
        self.index_count = 0
        self.epoch = 1
        self.preload_all_features = preload_all_features
        if self.preload_all_features:
            self.image_features = np.array(
                [np.load("%s/%s.npz" % (self.image_feature_root,
                                        os.path.splitext(image["file_path"])[0]))['arr_0']
                 for image in self.images])

    def get_batch(self, batch_size, raw_image=False):
        """Return (images, captions) for one random mini-batch.

        With raw_image=True the first element is a Batchx3x224x224 pixel
        array; otherwise it holds the precomputed feature vectors. Captions
        come back as raw text when holding_raw_captions is set, else as
        int32 token-id arrays.
        """
        chosen = self.random_indicies[self.index_count:self.index_count + batch_size]
        self.index_count += batch_size
        if self.index_count > len(self.captions):
            # Crossed an epoch boundary: bump the counter and reshuffle.
            self.epoch += 1
            self.suffle_data()
            self.index_count = 0

        if raw_image:
            chunks = [self.image_loader.load(
                          self.image_root + self.images[self.caption2image[i]]["file_path"],
                          expand_batch_dim=False)
                      for i in chosen]
            batch_images = np.array(chunks)
        elif self.preload_all_features:
            batch_images = self.image_features[[self.caption2image[i] for i in chosen]]
        else:
            batch_images = np.array(
                [np.load("%s/%s.npz" % (
                     self.image_feature_root,
                     os.path.splitext(self.images[self.caption2image[i]]["file_path"])[0]
                 ))['arr_0'] for i in chosen])

        if self.holding_raw_captions:
            batch_word_indices = [self.captions[i]["caption"] for i in chosen]
        else:
            batch_word_indices = [np.array(self.captions[i]["caption"], dtype=np.int32)
                                  for i in chosen]

        return batch_images, batch_word_indices

    def suffle_data(self):
        """Redraw the random visiting order (misspelled name kept for callers)."""
        self.random_indicies = np.random.permutation(len(self.captions))


if __name__ == '__main__':
    # Smoke test: fetch one batch of raw images, one of on-demand features,
    # and one with all features preloaded.
    import json

    with open("../data/MSCOCO/mscoco_train2014_all_preprocessed.json", 'r') as f:
        captions = json.load(f)

    dataset = CaptionDataLoader(
        captions,
        image_feature_root="../data/MSCOCO/MSCOCO_ResNet50_features/",
        image_root="../data/MSCOCO/MSCOCO_raw_images/")
    batch_images, batch_word_indices = dataset.get_batch(10, raw_image=True)
    print(batch_word_indices)
    print(batch_images)

    batch_image_features, batch_word_indices = dataset.get_batch(10)
    print(batch_word_indices)
    print(batch_image_features.shape)

    dataset = CaptionDataLoader(
        captions,
        image_feature_root="../data/MSCOCO/MSCOCO_ResNet50_features",
        preload_all_features=True)
    batch_image_features, batch_word_indices = dataset.get_batch(10)
    print(batch_word_indices)
    print(batch_image_features.shape)
129 changes: 129 additions & 0 deletions scenes/code/CaptionEvaluater.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

'''
evaluate captions by Blue, Rouge, and Cider

This can be used for any language, assuming that the inputs are already segmented (i.e. tokenized).
The ground truth should be lowercased before being passed in.
'''

import sys
sys.path.append('../coco-caption/')
sys.path.append('../coco-caption/pycocoevalcap/')
sys.path.append('./coco-caption/')
sys.path.append('./coco-caption/pycocoevalcap/')
from bleu.bleu import Bleu
from rouge.rouge import Rouge
from cider.cider import Cider
import string
import re

class CaptionEvaluater(object):
    """Score predicted captions against references with BLEU, ROUGE-L and CIDEr.

    Language-agnostic as long as the captions are already whitespace-tokenized.
    Ground-truth captions should be lowercased before being passed in.
    """

    def __init__(self):
        self.blue_scorer = Bleu(4)
        self.rouge_scorer = Rouge()
        self.cider_scorer = Cider()
        self.truth = None
        # ASCII punctuation plus common Japanese punctuation marks.
        remove = string.punctuation + "、。，."
        self.remove_pattern = r"[{}]".format(remove)  # create the pattern

    def remove_punctuation(self, line):
        """Drop <unk> tokens and punctuation from a single caption string."""
        # "。" (U+3002) is stripped explicitly; the regex handles the rest.
        # (The original duplicated each replace with u"..."/'\u3002' forms,
        # which are identical strings in Python 3.)
        line = line.replace("<unk>", "")
        line = line.replace("\u3002", "")
        return re.sub(self.remove_pattern, "", line)

    def trnasform_utf8(self, line):
        """Identity hook kept for API compatibility (old UTF-8 re-encoding step)."""
        # return u' '.join(line).encode('utf-8').strip()
        return line

    def set_ground_truth(self, ground_truth):
        '''
        ground_truth should be a python dictionary whose shape is;
        {"image_identifier": ["a caption", "a similar caption", ...], ...}
        "image_identifier" can be either string or number.
        '''
        for img in ground_truth:
            # Bug fix: wrap map() in list(). In Python 3 map() returns a
            # one-shot iterator; the scorers iterate the captions (possibly
            # more than once), so an iterator silently yields nothing the
            # second time around.
            ground_truth[img] = list(map(self.remove_punctuation, ground_truth[img]))
        self.truth = ground_truth

    def evaluate(self, predicetd_captions):
        '''
        predicetd_captions should be a python dictionary whose shape is;
        {"image_identifier": ["the predicted caption"], ...}
        "image_identifier" needs to match the one used in the ground truth.
        Make sure there is exactly one caption per image, even though the
        value is a python list.
        '''
        for img in predicetd_captions:
            # Same Python-3 map() fix as in set_ground_truth().
            predicetd_captions[img] = list(map(self.remove_punctuation,
                                               predicetd_captions[img]))

        results = {}
        # NOTE: keys are bleu-0 .. bleu-3 (0-based), matching the original output.
        for i, score in enumerate(self.get_bleu(predicetd_captions)[0]):
            results["bleu-%d" % i] = score
        results["rouge"] = self.get_rouge(predicetd_captions)[0]
        results["cider"] = self.get_cider(predicetd_captions)[0]

        return results

    def get_bleu(self, predicetd_captions):
        """Return (corpus_scores, per-image_scores) from the BLEU scorer."""
        score, scores = self.blue_scorer.compute_score(self.truth, predicetd_captions)
        # score is a python list [bleu-1, bleu-2, bleu-3, bleu-4]
        return score, scores

    def get_rouge(self, predicetd_captions):
        """Return (corpus_score, per-image_scores) from the ROUGE-L scorer."""
        score, scores = self.rouge_scorer.compute_score(self.truth, predicetd_captions)
        return score, scores

    def get_cider(self, predicetd_captions):
        """Return (corpus_score, per-image_scores) from the CIDEr scorer."""
        score, scores = self.cider_scorer.compute_score(self.truth, predicetd_captions)
        return score, scores

if __name__ == '__main__':
    # Test: pass in captions already split on spaces (tokenized).
    skateboard_refs = ['the skateboarder is putting on a show using the picnic table as his stage',
                       'a skateboarder pulling tricks on top of a picnic table',
                       'a man riding on a skateboard on top of a table',
                       'a skate boarder doing a trick on a picnic table',
                       'a person is riding a skateboard on a picnic table with a crowd watching']
    soup_refs = ['a bowl of soup that has some carrots shrimp and noodles in it',
                 'the healthy food is in the bowl and ready to eat',
                 'soup has carrots and shrimp in it as it sits next to chopsticks',
                 'a tasty bowl of ramen is served for someone to enjoy',
                 'bowl of asian noodle soup with shrimp and carrots']

    ground_truth = {}
    ground_truth['262148'] = ['オレンジ色 の シャツ を 着た 人 が います',
                              'オレンジ色 の Tシャツ を 着ている 人 が 立って います',
                              '人 が オレンジ色 の シャツ を 着て 立って います',
                              ]
    ground_truth[262148] = skateboard_refs
    ground_truth[393225] = soup_refs
    ground_truth[1] = skateboard_refs
    ground_truth[2] = soup_refs

    # Exactly one predicted caption per image is allowed.
    predicted = {}
    predicted['262148'] = ['人 が オレンジ色 の シャツ を 着て 立って <unk> います。']
    predicted[262148] = ['A man riding a skateboard down a ramp。']
    predicted[393225] = ['A bowl of soup with carrots and a spoon.']
    predicted[1] = ['a man riding a skateboard down a ramp。']
    predicted[2] = ['a bowl of soup with carrots and a spoon、<unk>']
    # Keys may be numbers or strings — just make sure ground truth and
    # predictions use matching identifiers.

    evaluater = CaptionEvaluater()
    evaluater.set_ground_truth(ground_truth)
    print(evaluater.evaluate(predicted))
    # https://github.com/tylin/coco-caption/issues/5
    # Yes, CIDEr can have values till 10 (technically).
Loading