Commit 24768a990fbda84a2df0b387178ce4039863d423
1 parent
f25fd27c
Exists in
master
and in
1 other branch
mode 'hbase' finished. (testing is sometimes interesting, especially when your code is well structured and with few bugs!)
Showing
4 changed files
with
390 additions
and
374 deletions
Show diff stats
mdata/ILSVRC-S.py
| ... | ... | @@ -1,359 +0,0 @@ |
| 1 | -__author__ = 'chunk' | |
| 2 | - | |
| 3 | -from . import * | |
| 4 | -from ..mfeat import HOG, IntraBlockDiff | |
| 5 | -from ..mspark import SC | |
| 6 | -from ..common import * | |
| 7 | - | |
| 8 | -import os, sys | |
| 9 | -from PIL import Image | |
| 10 | -from hashlib import md5 | |
| 11 | -import csv | |
| 12 | -import shutil | |
| 13 | -import json | |
| 14 | -import collections | |
| 15 | -import happybase | |
| 16 | - | |
| 17 | -from ..mjpeg import * | |
| 18 | -from ..msteg import * | |
| 19 | -from ..msteg.steganography import LSB, F3, F4, F5 | |
| 20 | - | |
| 21 | -import numpy as np | |
| 22 | -from numpy.random import randn | |
| 23 | -import pandas as pd | |
| 24 | -from scipy import stats | |
| 25 | - | |
| 26 | -from subprocess import Popen, PIPE, STDOUT | |
| 27 | -import tempfile | |
| 28 | - | |
| 29 | -np.random.seed(sum(map(ord, "whoami"))) | |
| 30 | - | |
| 31 | -package_dir = os.path.dirname(os.path.abspath(__file__)) | |
| 32 | - | |
| 33 | - | |
| 34 | -class DataILSVRCS(DataDumperBase): | |
| 35 | - """ | |
| 36 | - This module is specially for ILSVRC data processing under spark & hbase. | |
| 37 | - | |
| 38 | - We posit that the DB(e.g. HBase) has only the images data with md5 name as id. | |
| 39 | - The task is to gennerate info(size,capacity,quality,etc.) and class & chosen tags, and then to perform embedding and finally to calcculate ibd features. | |
| 40 | - | |
| 41 | - Each step includes reading from & writing to Hbase (though PC). | |
| 42 | - And each step must have a 'spark' mode option, which means that the operation is performed by spark with reading & wrting through RDDs. | |
| 43 | - | |
| 44 | - chunkplus@gmail.com | |
| 45 | - """ | |
| 46 | - | |
| 47 | - def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train'): | |
| 48 | - DataDumperBase.__init__(self, base_dir, category) | |
| 49 | - | |
| 50 | - self.base_dir = base_dir | |
| 51 | - self.category = category | |
| 52 | - | |
| 53 | - self.dict_data = {} | |
| 54 | - | |
| 55 | - self.table_name = self.base_dir.strip('/').split('/')[-1] + '-' + self.category | |
| 56 | - self.sparkcontex = None | |
| 57 | - | |
| 58 | - | |
| 59 | - def _get_table(self): | |
| 60 | - if self.table != None: | |
| 61 | - return self.table | |
| 62 | - | |
| 63 | - if self.connection is None: | |
| 64 | - c = happybase.Connection('HPC-server') | |
| 65 | - self.connection = c | |
| 66 | - | |
| 67 | - tables = self.connection.tables() | |
| 68 | - if self.table_name not in tables: | |
| 69 | - families = {'cf_pic': dict(), | |
| 70 | - 'cf_info': dict(max_versions=10), | |
| 71 | - 'cf_tag': dict(), | |
| 72 | - 'cf_feat': dict(), | |
| 73 | - } | |
| 74 | - self.connection.create_table(name=self.table_name, families=families) | |
| 75 | - | |
| 76 | - table = self.connection.table(name=self.table_name) | |
| 77 | - | |
| 78 | - self.table = table | |
| 79 | - | |
| 80 | - return table | |
| 81 | - | |
| 82 | - def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None): | |
| 83 | - """ | |
| 84 | - Tempfile is our friend. (?) | |
| 85 | - """ | |
| 86 | - info_rate = info_rate if info_rate != None else 0.0 | |
| 87 | - tag_chosen = tag_chosen if tag_chosen != None else stats.bernoulli.rvs(0.8) | |
| 88 | - tag_class = tag_class if tag_class != None else 0 | |
| 89 | - try: | |
| 90 | - tmpf = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
| 91 | - tmpf.write(img) | |
| 92 | - # tmpf.seek(0) | |
| 93 | - im = Jpeg(tmpf.name, key=sample_key) | |
| 94 | - info = [im.image_width, | |
| 95 | - im.image_height, | |
| 96 | - im.image_width * im.image_height, | |
| 97 | - im.getCapacity(), | |
| 98 | - im.getQuality(), | |
| 99 | - info_rate, | |
| 100 | - tag_chosen, | |
| 101 | - tag_class] | |
| 102 | - return info | |
| 103 | - except Exception as e: | |
| 104 | - print e | |
| 105 | - finally: | |
| 106 | - tmpf.close() | |
| 107 | - | |
| 108 | - def _get_feat(self, image, feattype='ibd', **kwargs): | |
| 109 | - size = kwargs.get('size', (48, 48)) | |
| 110 | - | |
| 111 | - if feattype == 'hog': | |
| 112 | - feater = HOG.FeatHOG(size=size) | |
| 113 | - elif feattype == 'ibd': | |
| 114 | - feater = IntraBlockDiff.FeatIntraBlockDiff() | |
| 115 | - else: | |
| 116 | - raise Exception("Unknown feature type!") | |
| 117 | - | |
| 118 | - desc = feater.feat(image) | |
| 119 | - | |
| 120 | - return desc | |
| 121 | - | |
| 122 | - def _extract_data(self, mode='hbase', writeback=False): | |
| 123 | - """ | |
| 124 | - Get info barely out of image data. | |
| 125 | - """ | |
| 126 | - if mode == 'hbase': | |
| 127 | - if self.table == None: | |
| 128 | - self.table = self.get_table() | |
| 129 | - | |
| 130 | - cols = ['cf_pic:data'] | |
| 131 | - for key, data in self.table.scan(columns=cols, scan_batching=True): | |
| 132 | - self.dict_data[key] = [data] + self._get_info(data) | |
| 133 | - | |
| 134 | - if not writeback: | |
| 135 | - return self.dict_data | |
| 136 | - else: | |
| 137 | - try: | |
| 138 | - with self.table.batch(batch_size=5000) as b: | |
| 139 | - for imgname, imginfo in self.dict_data.items(): | |
| 140 | - b.put(imgname, | |
| 141 | - { | |
| 142 | - # 'cf_pic:data': imginfo[0], | |
| 143 | - 'cf_info:width': imginfo[1], | |
| 144 | - 'cf_info:height': imginfo[2], | |
| 145 | - 'cf_info:size': imginfo[3], | |
| 146 | - 'cf_info:capacity': imginfo[4], | |
| 147 | - 'cf_info:quality': imginfo[5], | |
| 148 | - 'cf_info:rate': imginfo[6], | |
| 149 | - 'cf_tag:chosen': imginfo[7], | |
| 150 | - 'cf_tag:class': imginfo[8], }) | |
| 151 | - except ValueError: | |
| 152 | - raise | |
| 153 | - | |
| 154 | - | |
| 155 | - elif mode == 'spark': | |
| 156 | - pass | |
| 157 | - else: | |
| 158 | - raise Exception("Unknown mode!") | |
| 159 | - | |
| 160 | - | |
| 161 | - def _embed_data(self, mode='hbase', rate=None, readforward=False, writeback=False): | |
| 162 | - f5 = F5.F5(sample_key, 1) | |
| 163 | - if mode == 'hbase': | |
| 164 | - if self.table == None: | |
| 165 | - self.table = self.get_table() | |
| 166 | - | |
| 167 | - if readforward: | |
| 168 | - self.dict_data = {} | |
| 169 | - cols = ['cf_pic:data', | |
| 170 | - 'cf_info:width', | |
| 171 | - 'cf_info:height', | |
| 172 | - 'cf_info:size', | |
| 173 | - 'cf_info:capacity', | |
| 174 | - 'cf_info:quality', | |
| 175 | - 'cf_info:rate', | |
| 176 | - 'cf_tag:chosen', | |
| 177 | - 'cf_tag:class'] | |
| 178 | - for key, data in self.table.scan(columns=cols, scan_batching=True): | |
| 179 | - self.dict_data[key] = data | |
| 180 | - | |
| 181 | - dict_data_ext = {} | |
| 182 | - | |
| 183 | - for imgname, imgdata in self.dict_data.items(): | |
| 184 | - try: | |
| 185 | - tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
| 186 | - tmpf_src.write(imgdata[0]) | |
| 187 | - tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
| 188 | - | |
| 189 | - if rate == None: | |
| 190 | - embed_rate = f5.embed_raw_data(tmpf_src, os.path.join(package_dir, '../res/toembed'), tmpf_dst) | |
| 191 | - else: | |
| 192 | - assert (rate >= 0 and rate < 1) | |
| 193 | - # print capacity | |
| 194 | - hidden = np.random.bytes(int(imgdata[4] * rate) / 8) | |
| 195 | - embed_rate = f5.embed_raw_data(tmpf_src, hidden, tmpf_dst, frommem=True) | |
| 196 | - | |
| 197 | - tmpf_dst.seek(0) | |
| 198 | - raw = tmpf_dst.read() | |
| 199 | - index = md5(raw).hexdigest() | |
| 200 | - dict_data_ext[index + '.jpg'] = [raw] + self._get_info(raw, embed_rate, 0, 1) | |
| 201 | - | |
| 202 | - | |
| 203 | - except Exception as e: | |
| 204 | - print e | |
| 205 | - finally: | |
| 206 | - tmpf_src.close() | |
| 207 | - tmpf_dst.close() | |
| 208 | - | |
| 209 | - self.dict_data.update(dict_data_ext) | |
| 210 | - | |
| 211 | - if not writeback: | |
| 212 | - return self.dict_data | |
| 213 | - else: | |
| 214 | - try: | |
| 215 | - with self.table.batch(batch_size=5000) as b: | |
| 216 | - for imgname, imginfo in dict_data_ext.items(): | |
| 217 | - b.put(imgname, | |
| 218 | - { | |
| 219 | - 'cf_pic:data': imginfo[0], | |
| 220 | - 'cf_info:width': imginfo[1], | |
| 221 | - 'cf_info:height': imginfo[2], | |
| 222 | - 'cf_info:size': imginfo[3], | |
| 223 | - 'cf_info:capacity': imginfo[4], | |
| 224 | - 'cf_info:quality': imginfo[5], | |
| 225 | - 'cf_info:rate': imginfo[6], | |
| 226 | - 'cf_tag:chosen': imginfo[7], | |
| 227 | - 'cf_tag:class': imginfo[8], }) | |
| 228 | - except ValueError: | |
| 229 | - raise | |
| 230 | - | |
| 231 | - elif mode == 'spark': | |
| 232 | - pass | |
| 233 | - else: | |
| 234 | - raise Exception("Unknown mode!") | |
| 235 | - | |
| 236 | - | |
| 237 | - def _extract_feat(self, mode='hbase', feattype='ibd', readforward=False, writeback=False, **kwargs): | |
| 238 | - if mode == 'hbase': | |
| 239 | - if self.table == None: | |
| 240 | - self.table = self.get_table() | |
| 241 | - | |
| 242 | - if readforward: | |
| 243 | - self.dict_data = {} | |
| 244 | - cols = ['cf_pic:data', | |
| 245 | - 'cf_info:width', | |
| 246 | - 'cf_info:height', | |
| 247 | - 'cf_info:size', | |
| 248 | - 'cf_info:capacity', | |
| 249 | - 'cf_info:quality', | |
| 250 | - 'cf_info:rate', | |
| 251 | - 'cf_tag:chosen', | |
| 252 | - 'cf_tag:class'] | |
| 253 | - for key, data in self.table.scan(columns=cols, scan_batching=True): | |
| 254 | - self.dict_data[key] = data | |
| 255 | - | |
| 256 | - for imgname, imgdata in self.dict_data.items(): | |
| 257 | - try: | |
| 258 | - tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
| 259 | - tmpf_src.write(imgdata[0]) | |
| 260 | - | |
| 261 | - desc = json.dumps(self._get_feat(tmpf_src, feattype=feattype)) | |
| 262 | - | |
| 263 | - self.dict_data[imgname].append(desc) | |
| 264 | - | |
| 265 | - except Exception as e: | |
| 266 | - print e | |
| 267 | - finally: | |
| 268 | - tmpf_src.close() | |
| 269 | - | |
| 270 | - if not writeback: | |
| 271 | - return self.dict_data | |
| 272 | - else: | |
| 273 | - try: | |
| 274 | - with self.table.batch(batch_size=5000) as b: | |
| 275 | - for imgname, imginfo in self.dict_data.items(): | |
| 276 | - b.put(imgname, | |
| 277 | - { | |
| 278 | - 'cf_pic:data': imginfo[0], | |
| 279 | - 'cf_info:width': imginfo[1], | |
| 280 | - 'cf_info:height': imginfo[2], | |
| 281 | - 'cf_info:size': imginfo[3], | |
| 282 | - 'cf_info:capacity': imginfo[4], | |
| 283 | - 'cf_info:quality': imginfo[5], | |
| 284 | - 'cf_info:rate': imginfo[6], | |
| 285 | - 'cf_tag:chosen': imginfo[7], | |
| 286 | - 'cf_tag:class': imginfo[8], | |
| 287 | - 'cf_feat:' + feattype: imginfo[9]}) | |
| 288 | - except ValueError: | |
| 289 | - raise | |
| 290 | - | |
| 291 | - elif mode == 'spark': | |
| 292 | - pass | |
| 293 | - else: | |
| 294 | - raise Exception("Unknown mode!") | |
| 295 | - | |
| 296 | - | |
| 297 | - def format(self): | |
| 298 | - self._extract_data(mode='hbase', writeback=False) | |
| 299 | - self._embed_data(mode='hbase', rate=0.1, readforward=False, writeback=False) | |
| 300 | - self._extract_feat(mode='hbase', feattype='ibd', readforward=False, writeback=True) | |
| 301 | - | |
| 302 | - | |
| 303 | - def load_data(self, mode='local', feattype='ibd', tagtype='class'): | |
| 304 | - INDEX = [] | |
| 305 | - X = [] | |
| 306 | - Y = [] | |
| 307 | - | |
| 308 | - if mode == "local": | |
| 309 | - | |
| 310 | - dict_dataset = {} | |
| 311 | - | |
| 312 | - with open(self.list_file, 'rb') as tsvfile: | |
| 313 | - tsvfile = csv.reader(tsvfile, delimiter='\t') | |
| 314 | - for line in tsvfile: | |
| 315 | - hash = line[0] | |
| 316 | - tag = line[-1] | |
| 317 | - path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype) | |
| 318 | - if path_feat: | |
| 319 | - with open(path_feat, 'rb') as featfile: | |
| 320 | - dict_dataset[hash] = (tag, json.loads(featfile.read())) | |
| 321 | - | |
| 322 | - for tag, feat in dict_dataset.values(): | |
| 323 | - X.append([item for sublist in feat for subsublist in sublist for item in subsublist]) | |
| 324 | - Y.append(int(tag)) | |
| 325 | - | |
| 326 | - elif mode == "remote" or mode == "hbase": | |
| 327 | - if self.table == None: | |
| 328 | - self.table = self.get_table() | |
| 329 | - | |
| 330 | - col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype | |
| 331 | - for key, data in self.table.scan(columns=[col_feat, col_tag]): | |
| 332 | - X.append(json.loads(data[col_feat])) | |
| 333 | - Y.append(1 if data[col_tag] == 'True' else 0) | |
| 334 | - | |
| 335 | - elif mode == "spark" or mode == "cluster": | |
| 336 | - if self.sparkcontex == None: | |
| 337 | - self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077') | |
| 338 | - | |
| 339 | - result = self.sparkcontex.read_hbase(self.table_name) # result = {key:[feat,tag],...} | |
| 340 | - for feat, tag in result: | |
| 341 | - X.append(feat) | |
| 342 | - Y.append(tag) | |
| 343 | - | |
| 344 | - else: | |
| 345 | - raise Exception("Unknown mode!") | |
| 346 | - | |
| 347 | - return X, Y | |
| 348 | - | |
| 349 | - | |
| 350 | - | |
| 351 | - | |
| 352 | - | |
| 353 | - | |
| 354 | - | |
| 355 | - | |
| 356 | - | |
| 357 | - | |
| 358 | - | |
| 359 | - |
mdata/ILSVRC.py
| ... | ... | @@ -302,7 +302,7 @@ class DataILSVRC(DataDumperBase): |
| 302 | 302 | tsvfile = csv.reader(tsvfile, delimiter='\t') |
| 303 | 303 | for line in tsvfile: |
| 304 | 304 | path_img = os.path.join(self.img_dir, line[0][:3], line[0][3:] + '.jpg') |
| 305 | - if path_img: | |
| 305 | + if path_img: | |
| 306 | 306 | with open(path_img, 'rb') as fpic: |
| 307 | 307 | dict_databuf[line[0] + '.jpg'] = fpic.read() |
| 308 | 308 | ... | ... |
| ... | ... | @@ -0,0 +1,367 @@ |
| 1 | +__author__ = 'chunk' | |
| 2 | + | |
| 3 | +from . import * | |
| 4 | +from ..mfeat import HOG, IntraBlockDiff | |
| 5 | +from ..mspark import SC | |
| 6 | +from ..common import * | |
| 7 | + | |
| 8 | +import os, sys | |
| 9 | +from PIL import Image | |
| 10 | +from hashlib import md5 | |
| 11 | +import csv | |
| 12 | +import shutil | |
| 13 | +import json | |
| 14 | +import collections | |
| 15 | +import happybase | |
| 16 | + | |
| 17 | +from ..mjpeg import * | |
| 18 | +from ..msteg import * | |
| 19 | +from ..msteg.steganography import LSB, F3, F4, F5 | |
| 20 | + | |
| 21 | +import numpy as np | |
| 22 | +from numpy.random import randn | |
| 23 | +import pandas as pd | |
| 24 | +from scipy import stats | |
| 25 | + | |
| 26 | +from subprocess import Popen, PIPE, STDOUT | |
| 27 | +import tempfile | |
| 28 | + | |
| 29 | +np.random.seed(sum(map(ord, "whoami"))) | |
| 30 | + | |
| 31 | +package_dir = os.path.dirname(os.path.abspath(__file__)) | |
| 32 | + | |
| 33 | + | |
class DataILSVRC_S(DataDumperBase):
    """
    ILSVRC data processing under Spark & HBase.

    We posit that the DB (e.g. HBase) holds only the image data, keyed by md5
    name.  The task is to generate info (size, capacity, quality, etc.) and the
    class & chosen tags, then to perform embedding, and finally to calculate
    ibd features.

    Each step reads from & writes to HBase (through the PC), and each step must
    offer a 'spark' mode option, meaning the operation is performed by Spark
    with reading & writing done through RDDs.

    chunkplus@gmail.com
    """

    def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train'):
        DataDumperBase.__init__(self, base_dir, category)

        self.base_dir = base_dir
        self.category = category

        # Cache of {row_key: [raw_jpeg, width, height, size, capacity,
        #                     quality, rate, chosen, class, (feat)]}.
        self.dict_data = {}

        # Table name derives from the last path component plus the category,
        # e.g. 'ILSVRC2013_DET_val-Train'.
        self.table_name = self.base_dir.strip('/').split('/')[-1] + '-' + self.category
        self.sparkcontex = None

    def get_table(self):
        """Return the HBase table, lazily creating the connection and table."""
        if self.table is not None:
            return self.table

        if self.connection is None:
            self.connection = happybase.Connection('HPC-server')

        tables = self.connection.tables()
        if self.table_name not in tables:
            families = {
                'cf_pic': dict(),
                'cf_info': dict(max_versions=10),
                'cf_tag': dict(),
                'cf_feat': dict(),
            }
            self.connection.create_table(name=self.table_name, families=families)

        self.table = self.connection.table(name=self.table_name)
        return self.table

    def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None):
        """
        Compute info/tag fields for a raw JPEG byte string via a temp file.

        Returns [width, height, size, capacity, quality, rate, chosen, class]
        as str values (HBase cell values must be strings), or None if the
        image cannot be parsed (the error is printed, not raised).
        """
        info_rate = info_rate if info_rate is not None else 0.0
        tag_chosen = tag_chosen if tag_chosen is not None else stats.bernoulli.rvs(0.8)
        tag_class = tag_class if tag_class is not None else 0
        tmpf = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
        try:
            tmpf.write(img)
            # Flush buffered bytes so Jpeg can read the file by name.
            tmpf.seek(0)
            im = Jpeg(tmpf.name, key=sample_key)
            info = [str(im.image_width),
                    str(im.image_height),
                    str(im.image_width * im.image_height),
                    str(im.getCapacity()),
                    str(im.getQuality()),
                    str(info_rate),
                    str(tag_chosen),
                    str(tag_class)]
            return info
        except Exception as e:
            print(e)
        finally:
            tmpf.close()

    def _get_feat(self, image, feattype='ibd', **kwargs):
        """Compute a feature descriptor for an image (path expected by featers)."""
        size = kwargs.get('size', (48, 48))

        if feattype == 'hog':
            feater = HOG.FeatHOG(size=size)
        elif feattype == 'ibd':
            feater = IntraBlockDiff.FeatIntraBlockDiff()
        else:
            raise Exception("Unknown feature type!")

        return feater.feat(image)

    def _extract_data(self, mode='hbase', writeback=False):
        """
        Get info barely out of image data.

        Scans 'cf_pic:data', computes info/tag fields, and either returns the
        populated dict (writeback=False) or batch-writes the fields to HBase.
        """
        if mode == 'hbase':
            if self.table is None:
                self.table = self.get_table()

            cols = ['cf_pic:data']
            for key, data in self.table.scan(columns=cols):
                # scan() yields (row_key, {column: value}); unwrap the bytes.
                data = data['cf_pic:data']
                self.dict_data[key] = [data] + self._get_info(data)

            if not writeback:
                return self.dict_data
            else:
                try:
                    with self.table.batch(batch_size=5000) as b:
                        for imgname, imginfo in self.dict_data.items():
                            b.put(imgname,
                                  {
                                      # 'cf_pic:data': imginfo[0],
                                      'cf_info:width': imginfo[1],
                                      'cf_info:height': imginfo[2],
                                      'cf_info:size': imginfo[3],
                                      'cf_info:capacity': imginfo[4],
                                      'cf_info:quality': imginfo[5],
                                      'cf_info:rate': imginfo[6],
                                      'cf_tag:chosen': imginfo[7],
                                      'cf_tag:class': imginfo[8],
                                  })
                except ValueError:
                    raise

        elif mode == 'spark':
            pass
        else:
            raise Exception("Unknown mode!")

    def _embed_data(self, mode='hbase', rate=None, readforward=False, writeback=False):
        """
        Embed a payload into every image with F5 and append the stego copies
        (keyed by md5 of the stego bytes) to self.dict_data.

        rate=None embeds a fixed file from res/; otherwise a random payload of
        capacity*rate bits is embedded.
        """
        f5 = F5.F5(sample_key, 1)
        if mode == 'hbase':
            if self.table is None:
                self.table = self.get_table()

            if readforward:
                self.dict_data = {}
                cols = ['cf_pic:data',
                        'cf_info:width',
                        'cf_info:height',
                        'cf_info:size',
                        'cf_info:capacity',
                        'cf_info:quality',
                        'cf_info:rate',
                        'cf_tag:chosen',
                        'cf_tag:class']
                for key, data in self.table.scan(columns=cols):
                    # Keep list layout consistent with _extract_data.
                    self.dict_data[key] = [data[k] for k in cols]

            dict_data_ext = {}

            for imgname, imgdata in self.dict_data.items():
                tmpf_src = None
                tmpf_dst = None
                try:
                    tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
                    tmpf_src.write(imgdata[0])
                    tmpf_src.seek(0)  # flush so the embedder sees the bytes
                    tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')

                    if rate is None:
                        embed_rate = f5.embed_raw_data(tmpf_src.name,
                                                       os.path.join(package_dir, '../res/toembed'),
                                                       tmpf_dst.name)
                    else:
                        assert 0 <= rate < 1
                        # Payload length in bytes: capacity (bits) * rate / 8.
                        hidden = np.random.bytes(int(int(imgdata[4]) * rate) // 8)
                        embed_rate = f5.embed_raw_data(tmpf_src.name, hidden,
                                                       tmpf_dst.name, frommem=True)

                    tmpf_dst.seek(0)
                    raw = tmpf_dst.read()
                    index = md5(raw).hexdigest()
                    dict_data_ext[index + '.jpg'] = [raw] + self._get_info(raw, embed_rate, 0, 1)

                except Exception as e:
                    print(e)
                    raise
                finally:
                    # Guard: either temp file may be unbound if creation failed.
                    if tmpf_src is not None:
                        tmpf_src.close()
                    if tmpf_dst is not None:
                        tmpf_dst.close()

            self.dict_data.update(dict_data_ext)

            if not writeback:
                return self.dict_data
            else:
                try:
                    with self.table.batch(batch_size=5000) as b:
                        for imgname, imginfo in dict_data_ext.items():
                            b.put(imgname,
                                  {
                                      'cf_pic:data': imginfo[0],
                                      'cf_info:width': imginfo[1],
                                      'cf_info:height': imginfo[2],
                                      'cf_info:size': imginfo[3],
                                      'cf_info:capacity': imginfo[4],
                                      'cf_info:quality': imginfo[5],
                                      'cf_info:rate': imginfo[6],
                                      'cf_tag:chosen': imginfo[7],
                                      'cf_tag:class': imginfo[8], })
                except ValueError:
                    raise

        elif mode == 'spark':
            pass
        else:
            raise Exception("Unknown mode!")

    def _extract_feat(self, mode='hbase', feattype='ibd', readforward=False, writeback=False, **kwargs):
        """
        Compute a feature descriptor for every cached image and append it
        (JSON-encoded) to each row; optionally write everything back to HBase.
        """
        if mode == 'hbase':
            if self.table is None:
                self.table = self.get_table()

            if readforward:
                self.dict_data = {}
                cols = ['cf_pic:data',
                        'cf_info:width',
                        'cf_info:height',
                        'cf_info:size',
                        'cf_info:capacity',
                        'cf_info:quality',
                        'cf_info:rate',
                        'cf_tag:chosen',
                        'cf_tag:class']
                for key, data in self.table.scan(columns=cols):
                    self.dict_data[key] = [data[k] for k in cols]

            for imgname, imgdata in self.dict_data.items():
                tmpf_src = None
                try:
                    tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
                    tmpf_src.write(imgdata[0])
                    tmpf_src.seek(0)  # flush so the feature extractor sees the bytes

                    # ndarray is not JSON-serializable; convert via tolist().
                    desc = json.dumps(self._get_feat(tmpf_src.name, feattype=feattype).tolist())

                    self.dict_data[imgname].append(desc)

                except Exception as e:
                    print(e)
                    raise
                finally:
                    if tmpf_src is not None:
                        tmpf_src.close()

            if not writeback:
                return self.dict_data
            else:
                try:
                    with self.table.batch(batch_size=5000) as b:
                        for imgname, imginfo in self.dict_data.items():
                            b.put(imgname,
                                  {
                                      'cf_pic:data': imginfo[0],
                                      'cf_info:width': imginfo[1],
                                      'cf_info:height': imginfo[2],
                                      'cf_info:size': imginfo[3],
                                      'cf_info:capacity': imginfo[4],
                                      'cf_info:quality': imginfo[5],
                                      'cf_info:rate': imginfo[6],
                                      'cf_tag:chosen': imginfo[7],
                                      'cf_tag:class': imginfo[8],
                                      'cf_feat:' + feattype: imginfo[9]})
                except ValueError:
                    raise

        elif mode == 'spark':
            pass
        else:
            raise Exception("Unknown mode!")

    def format(self):
        """Run the full pipeline: extract info, embed, extract features."""
        self._extract_data(mode='hbase', writeback=False)
        self._embed_data(mode='hbase', rate=0.1, readforward=False, writeback=False)
        self._extract_feat(mode='hbase', feattype='ibd', readforward=False, writeback=True)

    def load_data(self, mode='local', feattype='ibd', tagtype='class'):
        """
        Load (X, Y) for training from local files, HBase, or Spark.

        Returns:
            X: list of flattened feature vectors.
            Y: list of int labels.
        """
        INDEX = []
        X = []
        Y = []

        if mode == "local":

            dict_dataset = {}

            with open(self.list_file, 'rb') as tsvfile:
                tsvfile = csv.reader(tsvfile, delimiter='\t')
                for line in tsvfile:
                    hash = line[0]
                    tag = line[-1]
                    path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype)
                    # A non-empty path string is always truthy; test existence.
                    if os.path.isfile(path_feat):
                        with open(path_feat, 'rb') as featfile:
                            dict_dataset[hash] = (tag, json.loads(featfile.read()))

            for tag, feat in dict_dataset.values():
                # Flatten the nested [[...], ...] feature structure.
                X.append([item for sublist in feat for subsublist in sublist for item in subsublist])
                Y.append(int(tag))

        elif mode == "remote" or mode == "hbase":
            if self.table is None:
                self.table = self.get_table()

            col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype
            for key, data in self.table.scan(columns=[col_feat, col_tag]):
                X.append(json.loads(data[col_feat]))
                Y.append(1 if data[col_tag] == 'True' else 0)

        elif mode == "spark" or mode == "cluster":
            if self.sparkcontex is None:
                self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV',
                                              master='spark://HPC-server:7077')

            result = self.sparkcontex.read_hbase(self.table_name)  # result = {key:[feat,tag],...}
            for feat, tag in result:
                X.append(feat)
                Y.append(tag)

        else:
            raise Exception("Unknown mode!")

        return X, Y
| 356 | + | |
| 357 | + | |
| 358 | + | |
| 359 | + | |
| 360 | + | |
| 361 | + | |
| 362 | + | |
| 363 | + | |
| 364 | + | |
| 365 | + | |
| 366 | + | |
| 367 | + | ... | ... |
test/test_data.py
| ... | ... | @@ -2,7 +2,7 @@ __author__ = 'chunk' |
| 2 | 2 | |
| 3 | 3 | from ..common import * |
| 4 | 4 | |
| 5 | -from ..mdata import MSR, CV, ILSVRC | |
| 5 | +from ..mdata import MSR, CV, ILSVRC, ILSVRC_S | |
| 6 | 6 | |
| 7 | 7 | |
| 8 | 8 | def test_MSR(): |
| ... | ... | @@ -31,30 +31,38 @@ def test_CV(): |
def test_ILSVRC():
    """Smoke-test the local-filesystem ILSVRC pipeline (format + store_img)."""
    timer = Timer()
    # dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train')
    dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2')
    # dil = ILSVRC.DataILSVRC(base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train_1')

    dil.format()
    # dil.embed(rate=0.1)
    # dil.extract_feat(feattype='ibd')
    # dil.extract_feat(feattype='hog')

    timer.mark()
    dil.store_img()
    timer.report()

    # timer.mark()
    # dil.store_tag()
    # timer.report()
    #
    # timer.mark()
    # dil.store_info()
    # timer.report()
    #
    # timer.mark()
    # dil.store_feat()
    # timer.report()
| 57 | 58 | |
def test_ILSVRC_S():
    """Exercise the HBase pipeline end to end: extract -> embed -> features."""
    timer = Timer()
    dils = ILSVRC_S.DataILSVRC_S(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2')

    dils._extract_data(mode='hbase', writeback=True)
    dils._embed_data(mode='hbase', rate=0.1, readforward=True, writeback=True)
    dils._extract_feat(mode='hbase', feattype='ibd', readforward=True, writeback=True)
| 58 | 66 | |
| 59 | 67 | if __name__ == '__main__': |
| 60 | 68 | # test_MSR() | ... | ... |