__author__ = 'chunk'

from . import *
from ..mfeat import HOG, IntraBlockDiff
from ..mspark import SC
from ..common import *

import os, sys
from PIL import Image
from hashlib import md5
import csv
import shutil
import json
import collections
import happybase

from ..mjpeg import *
from ..msteg import *

import numpy as np
from numpy.random import randn
import pandas as pd
from scipy import stats
from subprocess import Popen, PIPE, STDOUT

np.random.seed(sum(map(ord, "whoami")))

package_dir = os.path.dirname(os.path.abspath(__file__))


class DataILSVRC(DataDumperBase):
    def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train'):
        DataDumperBase.__init__(self, base_dir, category)

        self.base_dir = base_dir
        self.category = category
        self.data_dir = os.path.join(self.base_dir, self.category)

        self.dst_dir = os.path.join(self.base_dir, 'dst', self.category)
        self.list_file = os.path.join(self.dst_dir, 'file-tag.tsv')
        self.feat_dir = os.path.join(self.dst_dir, 'Feat')
        self.img_dir = os.path.join(self.dst_dir, 'Img')

        self.dict_data = {}

        self.table_name = self.base_dir.strip('/').split('/')[-1] + '-' + self.category
        self.sparkcontext = None

    def format(self):
        self.extract()

    def _hash_copy(self, image):
        if not image.endswith('jpg'):
            img = Image.open(image)
            img.save('../res/tmp.jpg', format='JPEG')
            image = '../res/tmp.jpg'

        with open(image, 'rb') as f:
            index = md5(f.read()).hexdigest()

        im = Jpeg(image, key=sample_key)
        self.dict_data[index] = [im.image_width, im.image_height, im.image_width * im.image_height,
                                 im.getCapacity(), im.getQuality()]
        # self.dict_data[index] = [im.image_width, im.image_height, os.path.getsize(image), im.getQuality()]

        # origin:
        # dir = base_dir + 'Img/Train/' + index[:3]
        dir = os.path.join(self.img_dir, index[:3])
        if not os.path.exists(dir):
            os.makedirs(dir)
        image_path = os.path.join(dir, index[3:] + '.jpg')
        # print image_path

        if not os.path.exists(image_path):
            shutil.copy(image, image_path)

    def _build_list(self, list_file=None):
        if list_file is None:
            list_file = self.list_file

        ordict_img = collections.OrderedDict(sorted(self.dict_data.items(), key=lambda d: d[0]))

        with open(list_file, 'w') as f:
            tsvfile = csv.writer(f, delimiter='\t')
            for key, value in ordict_img.items():
                tsvfile.writerow([key] + value)

    def _analysis(self, list_file=None):
        if list_file is None:
            list_file = self.list_file

        df_ILS = pd.read_csv(list_file, names=['hash', 'width', 'height', 'size', 'capacity', 'quality'],
                             sep='\t')
        length = df_ILS.shape[0]
        df_ILS = df_ILS.sort_values(['size', 'quality'], ascending=True)
        # Bernoulli(0.3): roughly 30% of the images are flagged as embedding targets.
        rand_class = stats.bernoulli.rvs(0.3, size=length)
        df_ILS['chosen'] = rand_class
        df_ILS['class'] = np.zeros(length, np.int32)
        df_ILS.to_csv(list_file, header=False, index=False, sep='\t')

    def extract(self):
        for path, subdirs, files in os.walk(self.data_dir):
            for name in files:
                imagepath = os.path.join(path, name)
                # print imagepath
                try:
                    self._hash_copy(imagepath)
                except Exception:
                    pass

        self._build_list()
        self._analysis()
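
    # The embed step below shells out to the F5 steganography tool, whose Java
    # classes are expected under ../libs/F5/ (see the CLASSPATH set in embed()).
    # A sketch of the assumed invocation, matching the `cmd` template:
    #
    #   java Embed <cover.jpg> <stego.jpg> -e <message-file> -p password -q <quality>
    #
    # Only rows with chosen == 1 in file-tag.tsv are re-embedded; each stego
    # output is re-hashed via _hash_copy() and merged back into the list with
    # class == 1.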
    def embed(self):
        self.dict_data = {}
        dict_embedresult = {}
        os.environ["CLASSPATH"] = os.path.join(package_dir, "../libs/F5/")
        # NOTE: F5's '-e' flag takes a message file to embed. The original format
        # string had four placeholders but was filled with only three arguments;
        # 'res/toembed' below is an assumed placeholder for that message file.
        cmd = 'java Embed %s %s -e %s -p password -c "stegan by chunk " -q %d'

        df_ILS = pd.read_csv(self.list_file,
                             names=['hash', 'width', 'height', 'size', 'capacity', 'quality', 'chosen', 'class'],
                             sep='\t')
        df_ILS_TARGET = df_ILS[df_ILS['chosen'] == 1]

        for hash, size, quality in zip(df_ILS_TARGET['hash'], df_ILS_TARGET['size'], df_ILS_TARGET['quality']):
            path_img = os.path.join(self.img_dir, hash[:3], hash[3:] + '.jpg')
            if os.path.exists(path_img):
                print path_img
                p = Popen(cmd % (path_img, 'res/tmp.jpg', 'res/toembed', quality), shell=True,
                          stdout=PIPE, stderr=STDOUT)
                dict_embedresult[hash] = [line.strip('\n') for line in p.stdout.readlines()]
                try:
                    self._hash_copy('res/tmp.jpg')
                except Exception:
                    pass

        with open(self.list_file + '.embed.log', 'wb') as f:
            tsvfile = csv.writer(f, delimiter='\t')
            for key, value in dict_embedresult.items():
                tsvfile.writerow([key] + value)

        self._build_list(self.list_file + '.embed')

        # merge the stego list into the cover list
        df_ILS_EMBED = pd.read_csv(self.list_file + '.embed',
                                   names=['hash', 'width', 'height', 'size', 'capacity', 'quality'],
                                   sep='\t')
        length = df_ILS_EMBED.shape[0]
        df_ILS_EMBED = df_ILS_EMBED.sort_values(['size', 'quality'], ascending=True)
        df_ILS_EMBED['chosen'] = np.zeros(length, np.int32)
        df_ILS_EMBED['class'] = np.ones(length, np.int32)

        df_ILS = df_ILS.append(df_ILS_EMBED, ignore_index=True)
        df_ILS.to_csv(self.list_file, header=False, index=False, sep='\t')

    def get_table(self):
        if self.table is not None:
            return self.table

        if self.connection is None:
            c = happybase.Connection('HPC-server')
            self.connection = c

        tables = self.connection.tables()
        if self.table_name not in tables:
            families = {'cf_pic': dict(),
                        'cf_info': dict(max_versions=10),
                        'cf_tag': dict(),
                        'cf_feat': dict(),
                        }
            self.connection.create_table(name=self.table_name, families=families)

        table = self.connection.table(name=self.table_name)
        self.table = table

        return table

    def store_image(self):
        if self.table is None:
            self.table = self.get_table()

        dict_databuf = {}

        with open(self.list_file, 'rb') as tsvfile:
            tsvfile = csv.reader(tsvfile, delimiter='\t')
            for line in tsvfile:
                path_img = os.path.join(self.img_dir, line[0][:3], line[0][3:] + '.jpg')
                if os.path.exists(path_img):
                    with open(path_img, 'rb') as fpic:
                        dict_databuf[line[0] + '.jpg'] = fpic.read()

        try:
            with self.table.batch(batch_size=5000) as b:
                for imgname, imgdata in dict_databuf.items():
                    b.put(imgname, {'cf_pic:data': imgdata})
        except ValueError:
            raise

    def store_info(self, infotype='all'):
        if self.table is None:
            self.table = self.get_table()

        dict_infobuf = {}

        with open(self.list_file, 'rb') as tsvfile:
            tsvfile = csv.reader(tsvfile, delimiter='\t')
            for line in tsvfile:
                # tsv columns: hash, width, height, size, capacity, quality, chosen, class
                dict_infobuf[line[0] + '.jpg'] = line[1:-2]

        if infotype == 'all':
            try:
                with self.table.batch(batch_size=5000) as b:
                    for imgname, imginfo in dict_infobuf.items():
                        b.put(imgname, {'cf_info:width': imginfo[0],
                                        'cf_info:height': imginfo[1],
                                        'cf_info:size': imginfo[2],
                                        'cf_info:capacity': imginfo[3],
                                        'cf_info:quality': imginfo[4]})
            except ValueError:
                raise
        else:
            raise Exception("Unknown infotype!")

    def store_tag(self, tagtype='all'):
        if self.table is None:
            self.table = self.get_table()

        dict_tagbuf = {}

        with open(self.list_file, 'rb') as tsvfile:
            tsvfile = csv.reader(tsvfile, delimiter='\t')
            for line in tsvfile:
                dict_tagbuf[line[0] + '.jpg'] = line[-2:]

        if tagtype == 'all':
            try:
                with self.table.batch(batch_size=5000) as b:
                    for imgname, imgtag in dict_tagbuf.items():
                        b.put(imgname, {'cf_tag:chosen': imgtag[0],
                                        'cf_tag:class': imgtag[1]})
            except ValueError:
                raise
        else:
            raise Exception("Unknown tagtype!")
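
    # HBase layout used by the store_* methods above: the row key is '<md5>.jpg';
    # the column families (created in get_table()) are
    #   cf_pic  - raw JPEG bytes             (cf_pic:data)
    #   cf_info - image metadata             (cf_info:width/height/size/capacity/quality)
    #   cf_tag  - sample labels              (cf_tag:chosen, cf_tag:class)
    #   cf_feat - serialized feature vectors (cf_feat:<feattype>)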
feature type!") list_image = [] with open(self.list_file, 'rb') as tsvfile: tsvfile = csv.reader(tsvfile, delimiter='\t') for line in tsvfile: list_image.append(line[0]) dict_featbuf = {} for imgname in list_image: # if imgtag == 'True': image = os.path.join(self.img_dir, imgname[:3], imgname[3:] + '.jpg') desc = feater.feat(image) dict_featbuf[imgname] = desc for imgname, desc in dict_featbuf.items(): # print imgname, desc dir = os.path.join(self.feat_dir, imgname[:3]) if not os.path.exists(dir): os.makedirs(dir) featpath = os.path.join(dir, imgname[3:].split('.')[0] + '.' + feattype) with open(featpath, 'wb') as featfile: featfile.write(json.dumps(desc.tolist())) def store_feat(self, feattype='ibd'): if self.table == None: self.table = self.get_table() dict_featbuf = {} for path, subdirs, files in os.walk(self.feat_dir): for name in files: featpath = os.path.join(path, name) # print featpath with open(featpath, 'rb') as featfile: imgname = path.split('/')[-1] + name.replace('.' + feattype, '.jpg') dict_featbuf[imgname] = featfile.read() try: with self.table.batch(batch_size=5000) as b: for imgname, featdesc in dict_featbuf.items(): b.put(imgname, {'cf_feat:' + feattype: featdesc}) except ValueError: raise pass def load_data(self, mode='local', feattype='ibd', tagtype='class'): INDEX = [] X = [] Y = [] if mode == "local": dict_dataset = {} with open(self.list_file, 'rb') as tsvfile: tsvfile = csv.reader(tsvfile, delimiter='\t') for line in tsvfile: hash = line[0] tag = line[-1] path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype) if path_feat: with open(path_feat, 'rb') as featfile: dict_dataset[hash] = (tag, json.loads(featfile.read())) for tag, feat in dict_dataset.values(): X.append([item for sublist in feat for subsublist in sublist for item in subsublist]) Y.append(float(tag)) elif mode == "remote" or mode == "hbase": if self.table == None: self.table = self.get_table() col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype for key, data in self.table.scan(columns=[col_feat, col_tag]): X.append(json.loads(data[col_feat])) Y.append(1 if data[col_tag] == 'True' else 0) elif mode == "spark" or mode == "cluster": if self.sparkcontex == None: self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077') result = self.sparkcontex.read_habase(self.table_name) # result = {key:[feat,tag],...} for feat, tag in result: X.append(feat) Y.append(tag) else: raise Exception("Unknown mode!") return X, Y