__author__ = 'chunk'

from mdata import *
from mfeat import *

import os, sys
from PIL import Image
from hashlib import md5
import csv
import shutil
from common import *
import json
import collections

import happybase


class DataCV(DataDumperBase):
    """Prepare an image data set on disk and load it into an HBase table.

    The pipeline implemented by the methods below is:

    1. ``extract`` walks ``<base_dir>Orig/``, copies every image into a
       directory tree keyed by the MD5 of its content and writes an
       ``Image.tsv`` index (``md5 <TAB> is_positive``).
    2. ``store_image`` / ``store_tag`` push image bytes and tags to HBase.
    3. ``get_feat`` computes HOG descriptors and writes them as JSON files;
       ``store_feat`` pushes those files to HBase.

    ``DataDumperBase``, ``FeatHOG`` and ``timer`` are not defined in this
    file; presumably they come from the star imports above.
    """

    def __init__(self, base_dir='/home/hadoop/data/HeadShoulder/', sub_dir='Train/'):
        """Remember the data-set location.

        :param base_dir: root directory of the data set (with trailing '/').
        :param sub_dir: sub-directory for this split (with trailing '/').
        """
        DataDumperBase.__init__(self)
        self.base_dir = base_dir
        self.sub_dir = sub_dir
        # Maps md5 hex digest of an image -> bool "came from a 'pos*' directory".
        self.dict_data = {}

    def format(self):
        """Run the extraction step (alias for ``extract``)."""
        self.extract()

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _img_dir(self):
        """Return ``<base_dir><sub_dir>Img/``."""
        return self.base_dir + self.sub_dir + 'Img/'

    def _read_tag_map(self, maplst):
        """Read a two-column TSV file into ``{<col0> + '.jpg': <col1>}``."""
        tags = {}
        with open(maplst, 'rb') as tsvfile:
            for line in csv.reader(tsvfile, delimiter='\t'):
                tags[line[0] + '.jpg'] = line[1]
        return tags

    def _batch_put(self, table, column, rows, batch_size=5000):
        """Write ``rows`` (``{row_key: value}``) into ``column`` of ``table``.

        The batch is flushed when the ``with`` block exits.  Errors raised by
        ``put`` are deliberately allowed to propagate instead of being
        swallowed.
        """
        with table.batch(batch_size=batch_size) as b:
            for rowkey, value in rows.items():
                b.put(rowkey, {column: value})

    def _hash_copy(self, image, ispos):
        """Copy ``image`` into the hashed image tree and record its tag.

        Files whose name does not end in ``'jpg'`` are first re-encoded as
        JPEG into ``res/tmp.jpg``.  The MD5 of the (possibly re-encoded) file
        content becomes the image id: the first three hex characters select
        the bucket directory and the remainder the file name.

        :param image: path of the source image.
        :param ispos: tag stored in ``self.dict_data`` for this image.
        """
        if not image.endswith('jpg'):
            img = Image.open(image)
            img.save('res/tmp.jpg', format='JPEG')
            image = 'res/tmp.jpg'

        with open(image, 'rb') as f:
            index = md5(f.read()).hexdigest()

        self.dict_data[index] = ispos

        # Earlier layout was base_dir + 'Img/Train/' + index[:3].
        bucket_dir = self._img_dir() + index[:3]
        if not os.path.exists(bucket_dir):
            os.makedirs(bucket_dir)

        path = bucket_dir + '/' + index[3:] + '.jpg'
        print(path)
        # Identical content hashes to the same path, so an existing file
        # never needs to be overwritten.
        if not os.path.exists(path):
            shutil.copy(image, path)

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def extract(self):
        """Hash-copy every file under ``<base_dir>Orig/`` and write the index.

        A file is tagged positive when the name of its parent directory
        starts with ``'pos'``.  The index ``<base_dir><sub_dir>Img/Image.tsv``
        lists ``md5 <TAB> tag`` sorted by md5.
        """
        for path, subdirs, files in os.walk(self.base_dir + 'Orig/'):
            for name in files:
                imagepath = os.path.join(path, name)
                print(imagepath)
                parent = imagepath.split('/')[-2]
                self._hash_copy(imagepath, parent.startswith('pos'))

        lstfile = self._img_dir() + 'Image.tsv'
        with open(lstfile, 'w') as f:
            tsvfile = csv.writer(f, delimiter='\t')
            # Keys are unique, so sorting the items sorts by md5.
            for key, value in sorted(self.dict_data.items()):
                tsvfile.writerow([key, value])

    def get_table(self, tablename, connection=None):
        """Return the HBase table ``tablename``, creating it if necessary.

        :param tablename: name of the table.
        :param connection: an existing happybase connection; when ``None`` a
            new connection to ``'HPC-server'`` is opened.
        :return: the happybase table object.
        """
        if connection is not None:
            c = connection
        else:
            c = happybase.Connection('HPC-server')

        if tablename not in c.tables():
            families = {
                'cf_pic': dict(),
                'cf_info': dict(max_versions=10),
                'cf_tag': dict(),
                'cf_feat': dict(),
            }
            c.create_table(name=tablename, families=families)

        return c.table(name=tablename)

    def store_image(self, table):
        """Load every indexed image and store its bytes in ``cf_pic:data``.

        Row keys are ``<md5>.jpg``.  A missing image file raises from
        ``open``.
        """
        timer.mark()
        maplst = self._img_dir() + 'Image.tsv'
        dict_databuf = {}
        with open(maplst, 'rb') as tsvfile:
            for line in csv.reader(tsvfile, delimiter='\t'):
                # NOTE(review): _hash_copy writes to 'Img/<prefix>/' but this
                # reads from 'Img/Train/<prefix>/' - confirm the intended
                # on-disk layout.
                path_img = self.base_dir + self.sub_dir + 'Img/Train/' + line[0][:3] + '/' + line[0][3:] + '.jpg'
                with open(path_img, 'rb') as fpic:
                    dict_databuf[line[0] + '.jpg'] = fpic.read()
        timer.report()  # previously measured: 58.761801s

        timer.mark()
        self._batch_put(table, 'cf_pic:data', dict_databuf)
        timer.report()  # previously measured: 15.570524s

    def store_tag(self, table):
        """Store the tag column of ``Image.tsv`` in ``cf_tag:class``.

        Row keys are ``<md5>.jpg``.
        """
        timer.mark()
        dict_tagbuf = self._read_tag_map(self._img_dir() + 'Image.tsv')
        timer.report()  # previously measured: 0.009741s

        timer.mark()
        self._batch_put(table, 'cf_tag:class', dict_tagbuf)
        timer.report()  # previously measured: 0.509696s

    def get_feat(self, category='hog'):
        """Compute a descriptor for every indexed image and save it as JSON.

        Descriptors are obtained from ``FeatHOG.feat(path, size=(48, 48))``
        for every entry, regardless of its tag, and written to
        ``<base_dir><sub_dir>Feat/Train/<prefix>/<rest>.<category>``.

        :param category: file extension used for the descriptor files.  It
            does not select the feature extractor; ``FeatHOG`` is always used.
        """
        # NOTE(review): extract() writes 'Image.tsv', whereas this reads
        # 'images_map_Train.tsv' - confirm which index file is intended.
        dict_tagbuf = self._read_tag_map(self._img_dir() + 'images_map_Train.tsv')

        dict_featbuf = {}
        timer.mark()
        for imgname in dict_tagbuf:
            path_img = self.base_dir + self.sub_dir + 'Img/Train/' + imgname[:3] + '/' + imgname[3:]
            dict_featbuf[imgname] = FeatHOG.feat(path_img, size=(48, 48))
        timer.report()  # previously measured: 4.337425s

        timer.mark()
        for imgname, desc in dict_featbuf.items():
            feat_dir = self.base_dir + self.sub_dir + 'Feat/Train/' + imgname[:3] + '/'
            if not os.path.exists(feat_dir):
                os.makedirs(feat_dir)
            featpath = feat_dir + imgname[3:].split('.')[0] + '.' + category
            with open(featpath, 'wb') as featfile:
                featfile.write(json.dumps(desc.tolist()))
        timer.report()  # previously measured: 14.862485s

    def store_feat(self, table):
        """Store every descriptor file under ``Feat/Train/`` in ``cf_feat:hog``.

        The row key is rebuilt from the bucket directory name plus the file
        name with ``'.hog'`` replaced by ``'.jpg'``.
        """
        timer.mark()
        feat_root = self.base_dir + self.sub_dir + 'Feat/'
        dict_featbuf = {}
        for path, subdirs, files in os.walk(feat_root + 'Train/'):
            for name in files:
                featpath = os.path.join(path, name)
                imgname = path.split('/')[-1] + name.replace('.hog', '.jpg')
                with open(featpath, 'rb') as featfile:
                    dict_featbuf[imgname] = featfile.read()
        timer.report()  # previously measured: 0.577940s

        timer.mark()
        self._batch_put(table, 'cf_feat:hog', dict_featbuf)
        timer.report()  # previously measured: 76.075477s