Commit 24768a990fbda84a2df0b387178ce4039863d423
1 parent
f25fd27c
Exists in
master
and in
1 other branch
mode 'hbase' finished. (testing is sometimes interesting, especially when your c…
…ode is well structured and with few bugs! )
Showing
4 changed files
with
390 additions
and
374 deletions
Show diff stats
mdata/ILSVRC-S.py
| @@ -1,359 +0,0 @@ | @@ -1,359 +0,0 @@ | ||
| 1 | -__author__ = 'chunk' | ||
| 2 | - | ||
| 3 | -from . import * | ||
| 4 | -from ..mfeat import HOG, IntraBlockDiff | ||
| 5 | -from ..mspark import SC | ||
| 6 | -from ..common import * | ||
| 7 | - | ||
| 8 | -import os, sys | ||
| 9 | -from PIL import Image | ||
| 10 | -from hashlib import md5 | ||
| 11 | -import csv | ||
| 12 | -import shutil | ||
| 13 | -import json | ||
| 14 | -import collections | ||
| 15 | -import happybase | ||
| 16 | - | ||
| 17 | -from ..mjpeg import * | ||
| 18 | -from ..msteg import * | ||
| 19 | -from ..msteg.steganography import LSB, F3, F4, F5 | ||
| 20 | - | ||
| 21 | -import numpy as np | ||
| 22 | -from numpy.random import randn | ||
| 23 | -import pandas as pd | ||
| 24 | -from scipy import stats | ||
| 25 | - | ||
| 26 | -from subprocess import Popen, PIPE, STDOUT | ||
| 27 | -import tempfile | ||
| 28 | - | ||
| 29 | -np.random.seed(sum(map(ord, "whoami"))) | ||
| 30 | - | ||
| 31 | -package_dir = os.path.dirname(os.path.abspath(__file__)) | ||
| 32 | - | ||
| 33 | - | ||
| 34 | -class DataILSVRCS(DataDumperBase): | ||
| 35 | - """ | ||
| 36 | - This module is specially for ILSVRC data processing under spark & hbase. | ||
| 37 | - | ||
| 38 | - We posit that the DB(e.g. HBase) has only the images data with md5 name as id. | ||
| 39 | - The task is to gennerate info(size,capacity,quality,etc.) and class & chosen tags, and then to perform embedding and finally to calcculate ibd features. | ||
| 40 | - | ||
| 41 | - Each step includes reading from & writing to Hbase (though PC). | ||
| 42 | - And each step must have a 'spark' mode option, which means that the operation is performed by spark with reading & wrting through RDDs. | ||
| 43 | - | ||
| 44 | - chunkplus@gmail.com | ||
| 45 | - """ | ||
| 46 | - | ||
| 47 | - def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train'): | ||
| 48 | - DataDumperBase.__init__(self, base_dir, category) | ||
| 49 | - | ||
| 50 | - self.base_dir = base_dir | ||
| 51 | - self.category = category | ||
| 52 | - | ||
| 53 | - self.dict_data = {} | ||
| 54 | - | ||
| 55 | - self.table_name = self.base_dir.strip('/').split('/')[-1] + '-' + self.category | ||
| 56 | - self.sparkcontex = None | ||
| 57 | - | ||
| 58 | - | ||
| 59 | - def _get_table(self): | ||
| 60 | - if self.table != None: | ||
| 61 | - return self.table | ||
| 62 | - | ||
| 63 | - if self.connection is None: | ||
| 64 | - c = happybase.Connection('HPC-server') | ||
| 65 | - self.connection = c | ||
| 66 | - | ||
| 67 | - tables = self.connection.tables() | ||
| 68 | - if self.table_name not in tables: | ||
| 69 | - families = {'cf_pic': dict(), | ||
| 70 | - 'cf_info': dict(max_versions=10), | ||
| 71 | - 'cf_tag': dict(), | ||
| 72 | - 'cf_feat': dict(), | ||
| 73 | - } | ||
| 74 | - self.connection.create_table(name=self.table_name, families=families) | ||
| 75 | - | ||
| 76 | - table = self.connection.table(name=self.table_name) | ||
| 77 | - | ||
| 78 | - self.table = table | ||
| 79 | - | ||
| 80 | - return table | ||
| 81 | - | ||
| 82 | - def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None): | ||
| 83 | - """ | ||
| 84 | - Tempfile is our friend. (?) | ||
| 85 | - """ | ||
| 86 | - info_rate = info_rate if info_rate != None else 0.0 | ||
| 87 | - tag_chosen = tag_chosen if tag_chosen != None else stats.bernoulli.rvs(0.8) | ||
| 88 | - tag_class = tag_class if tag_class != None else 0 | ||
| 89 | - try: | ||
| 90 | - tmpf = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | ||
| 91 | - tmpf.write(img) | ||
| 92 | - # tmpf.seek(0) | ||
| 93 | - im = Jpeg(tmpf.name, key=sample_key) | ||
| 94 | - info = [im.image_width, | ||
| 95 | - im.image_height, | ||
| 96 | - im.image_width * im.image_height, | ||
| 97 | - im.getCapacity(), | ||
| 98 | - im.getQuality(), | ||
| 99 | - info_rate, | ||
| 100 | - tag_chosen, | ||
| 101 | - tag_class] | ||
| 102 | - return info | ||
| 103 | - except Exception as e: | ||
| 104 | - print e | ||
| 105 | - finally: | ||
| 106 | - tmpf.close() | ||
| 107 | - | ||
| 108 | - def _get_feat(self, image, feattype='ibd', **kwargs): | ||
| 109 | - size = kwargs.get('size', (48, 48)) | ||
| 110 | - | ||
| 111 | - if feattype == 'hog': | ||
| 112 | - feater = HOG.FeatHOG(size=size) | ||
| 113 | - elif feattype == 'ibd': | ||
| 114 | - feater = IntraBlockDiff.FeatIntraBlockDiff() | ||
| 115 | - else: | ||
| 116 | - raise Exception("Unknown feature type!") | ||
| 117 | - | ||
| 118 | - desc = feater.feat(image) | ||
| 119 | - | ||
| 120 | - return desc | ||
| 121 | - | ||
| 122 | - def _extract_data(self, mode='hbase', writeback=False): | ||
| 123 | - """ | ||
| 124 | - Get info barely out of image data. | ||
| 125 | - """ | ||
| 126 | - if mode == 'hbase': | ||
| 127 | - if self.table == None: | ||
| 128 | - self.table = self.get_table() | ||
| 129 | - | ||
| 130 | - cols = ['cf_pic:data'] | ||
| 131 | - for key, data in self.table.scan(columns=cols, scan_batching=True): | ||
| 132 | - self.dict_data[key] = [data] + self._get_info(data) | ||
| 133 | - | ||
| 134 | - if not writeback: | ||
| 135 | - return self.dict_data | ||
| 136 | - else: | ||
| 137 | - try: | ||
| 138 | - with self.table.batch(batch_size=5000) as b: | ||
| 139 | - for imgname, imginfo in self.dict_data.items(): | ||
| 140 | - b.put(imgname, | ||
| 141 | - { | ||
| 142 | - # 'cf_pic:data': imginfo[0], | ||
| 143 | - 'cf_info:width': imginfo[1], | ||
| 144 | - 'cf_info:height': imginfo[2], | ||
| 145 | - 'cf_info:size': imginfo[3], | ||
| 146 | - 'cf_info:capacity': imginfo[4], | ||
| 147 | - 'cf_info:quality': imginfo[5], | ||
| 148 | - 'cf_info:rate': imginfo[6], | ||
| 149 | - 'cf_tag:chosen': imginfo[7], | ||
| 150 | - 'cf_tag:class': imginfo[8], }) | ||
| 151 | - except ValueError: | ||
| 152 | - raise | ||
| 153 | - | ||
| 154 | - | ||
| 155 | - elif mode == 'spark': | ||
| 156 | - pass | ||
| 157 | - else: | ||
| 158 | - raise Exception("Unknown mode!") | ||
| 159 | - | ||
| 160 | - | ||
| 161 | - def _embed_data(self, mode='hbase', rate=None, readforward=False, writeback=False): | ||
| 162 | - f5 = F5.F5(sample_key, 1) | ||
| 163 | - if mode == 'hbase': | ||
| 164 | - if self.table == None: | ||
| 165 | - self.table = self.get_table() | ||
| 166 | - | ||
| 167 | - if readforward: | ||
| 168 | - self.dict_data = {} | ||
| 169 | - cols = ['cf_pic:data', | ||
| 170 | - 'cf_info:width', | ||
| 171 | - 'cf_info:height', | ||
| 172 | - 'cf_info:size', | ||
| 173 | - 'cf_info:capacity', | ||
| 174 | - 'cf_info:quality', | ||
| 175 | - 'cf_info:rate', | ||
| 176 | - 'cf_tag:chosen', | ||
| 177 | - 'cf_tag:class'] | ||
| 178 | - for key, data in self.table.scan(columns=cols, scan_batching=True): | ||
| 179 | - self.dict_data[key] = data | ||
| 180 | - | ||
| 181 | - dict_data_ext = {} | ||
| 182 | - | ||
| 183 | - for imgname, imgdata in self.dict_data.items(): | ||
| 184 | - try: | ||
| 185 | - tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | ||
| 186 | - tmpf_src.write(imgdata[0]) | ||
| 187 | - tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | ||
| 188 | - | ||
| 189 | - if rate == None: | ||
| 190 | - embed_rate = f5.embed_raw_data(tmpf_src, os.path.join(package_dir, '../res/toembed'), tmpf_dst) | ||
| 191 | - else: | ||
| 192 | - assert (rate >= 0 and rate < 1) | ||
| 193 | - # print capacity | ||
| 194 | - hidden = np.random.bytes(int(imgdata[4] * rate) / 8) | ||
| 195 | - embed_rate = f5.embed_raw_data(tmpf_src, hidden, tmpf_dst, frommem=True) | ||
| 196 | - | ||
| 197 | - tmpf_dst.seek(0) | ||
| 198 | - raw = tmpf_dst.read() | ||
| 199 | - index = md5(raw).hexdigest() | ||
| 200 | - dict_data_ext[index + '.jpg'] = [raw] + self._get_info(raw, embed_rate, 0, 1) | ||
| 201 | - | ||
| 202 | - | ||
| 203 | - except Exception as e: | ||
| 204 | - print e | ||
| 205 | - finally: | ||
| 206 | - tmpf_src.close() | ||
| 207 | - tmpf_dst.close() | ||
| 208 | - | ||
| 209 | - self.dict_data.update(dict_data_ext) | ||
| 210 | - | ||
| 211 | - if not writeback: | ||
| 212 | - return self.dict_data | ||
| 213 | - else: | ||
| 214 | - try: | ||
| 215 | - with self.table.batch(batch_size=5000) as b: | ||
| 216 | - for imgname, imginfo in dict_data_ext.items(): | ||
| 217 | - b.put(imgname, | ||
| 218 | - { | ||
| 219 | - 'cf_pic:data': imginfo[0], | ||
| 220 | - 'cf_info:width': imginfo[1], | ||
| 221 | - 'cf_info:height': imginfo[2], | ||
| 222 | - 'cf_info:size': imginfo[3], | ||
| 223 | - 'cf_info:capacity': imginfo[4], | ||
| 224 | - 'cf_info:quality': imginfo[5], | ||
| 225 | - 'cf_info:rate': imginfo[6], | ||
| 226 | - 'cf_tag:chosen': imginfo[7], | ||
| 227 | - 'cf_tag:class': imginfo[8], }) | ||
| 228 | - except ValueError: | ||
| 229 | - raise | ||
| 230 | - | ||
| 231 | - elif mode == 'spark': | ||
| 232 | - pass | ||
| 233 | - else: | ||
| 234 | - raise Exception("Unknown mode!") | ||
| 235 | - | ||
| 236 | - | ||
| 237 | - def _extract_feat(self, mode='hbase', feattype='ibd', readforward=False, writeback=False, **kwargs): | ||
| 238 | - if mode == 'hbase': | ||
| 239 | - if self.table == None: | ||
| 240 | - self.table = self.get_table() | ||
| 241 | - | ||
| 242 | - if readforward: | ||
| 243 | - self.dict_data = {} | ||
| 244 | - cols = ['cf_pic:data', | ||
| 245 | - 'cf_info:width', | ||
| 246 | - 'cf_info:height', | ||
| 247 | - 'cf_info:size', | ||
| 248 | - 'cf_info:capacity', | ||
| 249 | - 'cf_info:quality', | ||
| 250 | - 'cf_info:rate', | ||
| 251 | - 'cf_tag:chosen', | ||
| 252 | - 'cf_tag:class'] | ||
| 253 | - for key, data in self.table.scan(columns=cols, scan_batching=True): | ||
| 254 | - self.dict_data[key] = data | ||
| 255 | - | ||
| 256 | - for imgname, imgdata in self.dict_data.items(): | ||
| 257 | - try: | ||
| 258 | - tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | ||
| 259 | - tmpf_src.write(imgdata[0]) | ||
| 260 | - | ||
| 261 | - desc = json.dumps(self._get_feat(tmpf_src, feattype=feattype)) | ||
| 262 | - | ||
| 263 | - self.dict_data[imgname].append(desc) | ||
| 264 | - | ||
| 265 | - except Exception as e: | ||
| 266 | - print e | ||
| 267 | - finally: | ||
| 268 | - tmpf_src.close() | ||
| 269 | - | ||
| 270 | - if not writeback: | ||
| 271 | - return self.dict_data | ||
| 272 | - else: | ||
| 273 | - try: | ||
| 274 | - with self.table.batch(batch_size=5000) as b: | ||
| 275 | - for imgname, imginfo in self.dict_data.items(): | ||
| 276 | - b.put(imgname, | ||
| 277 | - { | ||
| 278 | - 'cf_pic:data': imginfo[0], | ||
| 279 | - 'cf_info:width': imginfo[1], | ||
| 280 | - 'cf_info:height': imginfo[2], | ||
| 281 | - 'cf_info:size': imginfo[3], | ||
| 282 | - 'cf_info:capacity': imginfo[4], | ||
| 283 | - 'cf_info:quality': imginfo[5], | ||
| 284 | - 'cf_info:rate': imginfo[6], | ||
| 285 | - 'cf_tag:chosen': imginfo[7], | ||
| 286 | - 'cf_tag:class': imginfo[8], | ||
| 287 | - 'cf_feat:' + feattype: imginfo[9]}) | ||
| 288 | - except ValueError: | ||
| 289 | - raise | ||
| 290 | - | ||
| 291 | - elif mode == 'spark': | ||
| 292 | - pass | ||
| 293 | - else: | ||
| 294 | - raise Exception("Unknown mode!") | ||
| 295 | - | ||
| 296 | - | ||
| 297 | - def format(self): | ||
| 298 | - self._extract_data(mode='hbase', writeback=False) | ||
| 299 | - self._embed_data(mode='hbase', rate=0.1, readforward=False, writeback=False) | ||
| 300 | - self._extract_feat(mode='hbase', feattype='ibd', readforward=False, writeback=True) | ||
| 301 | - | ||
| 302 | - | ||
| 303 | - def load_data(self, mode='local', feattype='ibd', tagtype='class'): | ||
| 304 | - INDEX = [] | ||
| 305 | - X = [] | ||
| 306 | - Y = [] | ||
| 307 | - | ||
| 308 | - if mode == "local": | ||
| 309 | - | ||
| 310 | - dict_dataset = {} | ||
| 311 | - | ||
| 312 | - with open(self.list_file, 'rb') as tsvfile: | ||
| 313 | - tsvfile = csv.reader(tsvfile, delimiter='\t') | ||
| 314 | - for line in tsvfile: | ||
| 315 | - hash = line[0] | ||
| 316 | - tag = line[-1] | ||
| 317 | - path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype) | ||
| 318 | - if path_feat: | ||
| 319 | - with open(path_feat, 'rb') as featfile: | ||
| 320 | - dict_dataset[hash] = (tag, json.loads(featfile.read())) | ||
| 321 | - | ||
| 322 | - for tag, feat in dict_dataset.values(): | ||
| 323 | - X.append([item for sublist in feat for subsublist in sublist for item in subsublist]) | ||
| 324 | - Y.append(int(tag)) | ||
| 325 | - | ||
| 326 | - elif mode == "remote" or mode == "hbase": | ||
| 327 | - if self.table == None: | ||
| 328 | - self.table = self.get_table() | ||
| 329 | - | ||
| 330 | - col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype | ||
| 331 | - for key, data in self.table.scan(columns=[col_feat, col_tag]): | ||
| 332 | - X.append(json.loads(data[col_feat])) | ||
| 333 | - Y.append(1 if data[col_tag] == 'True' else 0) | ||
| 334 | - | ||
| 335 | - elif mode == "spark" or mode == "cluster": | ||
| 336 | - if self.sparkcontex == None: | ||
| 337 | - self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077') | ||
| 338 | - | ||
| 339 | - result = self.sparkcontex.read_hbase(self.table_name) # result = {key:[feat,tag],...} | ||
| 340 | - for feat, tag in result: | ||
| 341 | - X.append(feat) | ||
| 342 | - Y.append(tag) | ||
| 343 | - | ||
| 344 | - else: | ||
| 345 | - raise Exception("Unknown mode!") | ||
| 346 | - | ||
| 347 | - return X, Y | ||
| 348 | - | ||
| 349 | - | ||
| 350 | - | ||
| 351 | - | ||
| 352 | - | ||
| 353 | - | ||
| 354 | - | ||
| 355 | - | ||
| 356 | - | ||
| 357 | - | ||
| 358 | - | ||
| 359 | - |
mdata/ILSVRC.py
| @@ -302,7 +302,7 @@ class DataILSVRC(DataDumperBase): | @@ -302,7 +302,7 @@ class DataILSVRC(DataDumperBase): | ||
| 302 | tsvfile = csv.reader(tsvfile, delimiter='\t') | 302 | tsvfile = csv.reader(tsvfile, delimiter='\t') |
| 303 | for line in tsvfile: | 303 | for line in tsvfile: |
| 304 | path_img = os.path.join(self.img_dir, line[0][:3], line[0][3:] + '.jpg') | 304 | path_img = os.path.join(self.img_dir, line[0][:3], line[0][3:] + '.jpg') |
| 305 | - if path_img: | 305 | + if path_img: |
| 306 | with open(path_img, 'rb') as fpic: | 306 | with open(path_img, 'rb') as fpic: |
| 307 | dict_databuf[line[0] + '.jpg'] = fpic.read() | 307 | dict_databuf[line[0] + '.jpg'] = fpic.read() |
| 308 | 308 |
| @@ -0,0 +1,367 @@ | @@ -0,0 +1,367 @@ | ||
| 1 | +__author__ = 'chunk' | ||
| 2 | + | ||
| 3 | +from . import * | ||
| 4 | +from ..mfeat import HOG, IntraBlockDiff | ||
| 5 | +from ..mspark import SC | ||
| 6 | +from ..common import * | ||
| 7 | + | ||
| 8 | +import os, sys | ||
| 9 | +from PIL import Image | ||
| 10 | +from hashlib import md5 | ||
| 11 | +import csv | ||
| 12 | +import shutil | ||
| 13 | +import json | ||
| 14 | +import collections | ||
| 15 | +import happybase | ||
| 16 | + | ||
| 17 | +from ..mjpeg import * | ||
| 18 | +from ..msteg import * | ||
| 19 | +from ..msteg.steganography import LSB, F3, F4, F5 | ||
| 20 | + | ||
| 21 | +import numpy as np | ||
| 22 | +from numpy.random import randn | ||
| 23 | +import pandas as pd | ||
| 24 | +from scipy import stats | ||
| 25 | + | ||
| 26 | +from subprocess import Popen, PIPE, STDOUT | ||
| 27 | +import tempfile | ||
| 28 | + | ||
| 29 | +np.random.seed(sum(map(ord, "whoami"))) | ||
| 30 | + | ||
| 31 | +package_dir = os.path.dirname(os.path.abspath(__file__)) | ||
| 32 | + | ||
| 33 | + | ||
| 34 | +class DataILSVRC_S(DataDumperBase): | ||
| 35 | + """ | ||
| 36 | + This module is specially for ILSVRC data processing under spark & hbase. | ||
| 37 | + | ||
| 38 | + We posit that the DB(e.g. HBase) has only the images data with md5 name as id. | ||
| 39 | + The task is to gennerate info(size,capacity,quality,etc.) and class & chosen tags, and then to perform embedding and finally to calcculate ibd features. | ||
| 40 | + | ||
| 41 | + Each step includes reading from & writing to Hbase (though PC). | ||
| 42 | + And each step must have a 'spark' mode option, which means that the operation is performed by spark with reading & wrting through RDDs. | ||
| 43 | + | ||
| 44 | + chunkplus@gmail.com | ||
| 45 | + """ | ||
| 46 | + | ||
| 47 | + def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train'): | ||
| 48 | + DataDumperBase.__init__(self, base_dir, category) | ||
| 49 | + | ||
| 50 | + self.base_dir = base_dir | ||
| 51 | + self.category = category | ||
| 52 | + | ||
| 53 | + self.dict_data = {} | ||
| 54 | + | ||
| 55 | + self.table_name = self.base_dir.strip('/').split('/')[-1] + '-' + self.category | ||
| 56 | + self.sparkcontex = None | ||
| 57 | + | ||
| 58 | + | ||
| 59 | + def get_table(self): | ||
| 60 | + if self.table != None: | ||
| 61 | + return self.table | ||
| 62 | + | ||
| 63 | + if self.connection is None: | ||
| 64 | + c = happybase.Connection('HPC-server') | ||
| 65 | + self.connection = c | ||
| 66 | + | ||
| 67 | + tables = self.connection.tables() | ||
| 68 | + if self.table_name not in tables: | ||
| 69 | + families = {'cf_pic': dict(), | ||
| 70 | + 'cf_info': dict(max_versions=10), | ||
| 71 | + 'cf_tag': dict(), | ||
| 72 | + 'cf_feat': dict(), | ||
| 73 | + } | ||
| 74 | + self.connection.create_table(name=self.table_name, families=families) | ||
| 75 | + | ||
| 76 | + table = self.connection.table(name=self.table_name) | ||
| 77 | + | ||
| 78 | + self.table = table | ||
| 79 | + | ||
| 80 | + return table | ||
| 81 | + | ||
| 82 | + def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None): | ||
| 83 | + """ | ||
| 84 | + Tempfile is our friend. (?) | ||
| 85 | + """ | ||
| 86 | + info_rate = info_rate if info_rate != None else 0.0 | ||
| 87 | + tag_chosen = tag_chosen if tag_chosen != None else stats.bernoulli.rvs(0.8) | ||
| 88 | + tag_class = tag_class if tag_class != None else 0 | ||
| 89 | + try: | ||
| 90 | + tmpf = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | ||
| 91 | + tmpf.write(img) | ||
| 92 | + tmpf.seek(0) | ||
| 93 | + im = Jpeg(tmpf.name, key=sample_key) | ||
| 94 | + info = [str(im.image_width), | ||
| 95 | + str(im.image_height), | ||
| 96 | + str(im.image_width * im.image_height), | ||
| 97 | + str(im.getCapacity()), | ||
| 98 | + str(im.getQuality()), | ||
| 99 | + str(info_rate), | ||
| 100 | + str(tag_chosen), | ||
| 101 | + str(tag_class)] | ||
| 102 | + return info | ||
| 103 | + except Exception as e: | ||
| 104 | + print e | ||
| 105 | + finally: | ||
| 106 | + tmpf.close() | ||
| 107 | + | ||
| 108 | + def _get_feat(self, image, feattype='ibd', **kwargs): | ||
| 109 | + size = kwargs.get('size', (48, 48)) | ||
| 110 | + | ||
| 111 | + if feattype == 'hog': | ||
| 112 | + feater = HOG.FeatHOG(size=size) | ||
| 113 | + elif feattype == 'ibd': | ||
| 114 | + feater = IntraBlockDiff.FeatIntraBlockDiff() | ||
| 115 | + else: | ||
| 116 | + raise Exception("Unknown feature type!") | ||
| 117 | + | ||
| 118 | + desc = feater.feat(image) | ||
| 119 | + | ||
| 120 | + return desc | ||
| 121 | + | ||
| 122 | + def _extract_data(self, mode='hbase', writeback=False): | ||
| 123 | + """ | ||
| 124 | + Get info barely out of image data. | ||
| 125 | + """ | ||
| 126 | + if mode == 'hbase': | ||
| 127 | + if self.table == None: | ||
| 128 | + self.table = self.get_table() | ||
| 129 | + | ||
| 130 | + cols = ['cf_pic:data'] | ||
| 131 | + for key, data in self.table.scan(columns=cols): | ||
| 132 | + data = data['cf_pic:data'] | ||
| 133 | + self.dict_data[key] = [data] + self._get_info(data) | ||
| 134 | + | ||
| 135 | + if not writeback: | ||
| 136 | + return self.dict_data | ||
| 137 | + else: | ||
| 138 | + try: | ||
| 139 | + with self.table.batch(batch_size=5000) as b: | ||
| 140 | + for imgname, imginfo in self.dict_data.items(): | ||
| 141 | + b.put(imgname, | ||
| 142 | + { | ||
| 143 | + # 'cf_pic:data': imginfo[0], | ||
| 144 | + 'cf_info:width': imginfo[1], | ||
| 145 | + 'cf_info:height': imginfo[2], | ||
| 146 | + 'cf_info:size': imginfo[3], | ||
| 147 | + 'cf_info:capacity': imginfo[4], | ||
| 148 | + 'cf_info:quality': imginfo[5], | ||
| 149 | + 'cf_info:rate': imginfo[6], | ||
| 150 | + 'cf_tag:chosen': imginfo[7], | ||
| 151 | + 'cf_tag:class': imginfo[8], | ||
| 152 | + }) | ||
| 153 | + except ValueError: | ||
| 154 | + raise | ||
| 155 | + | ||
| 156 | + | ||
| 157 | + elif mode == 'spark': | ||
| 158 | + pass | ||
| 159 | + else: | ||
| 160 | + raise Exception("Unknown mode!") | ||
| 161 | + | ||
| 162 | + | ||
| 163 | + def _embed_data(self, mode='hbase', rate=None, readforward=False, writeback=False): | ||
| 164 | + f5 = F5.F5(sample_key, 1) | ||
| 165 | + if mode == 'hbase': | ||
| 166 | + if self.table == None: | ||
| 167 | + self.table = self.get_table() | ||
| 168 | + | ||
| 169 | + if readforward: | ||
| 170 | + self.dict_data = {} | ||
| 171 | + cols = ['cf_pic:data', | ||
| 172 | + 'cf_info:width', | ||
| 173 | + 'cf_info:height', | ||
| 174 | + 'cf_info:size', | ||
| 175 | + 'cf_info:capacity', | ||
| 176 | + 'cf_info:quality', | ||
| 177 | + 'cf_info:rate', | ||
| 178 | + 'cf_tag:chosen', | ||
| 179 | + 'cf_tag:class'] | ||
| 180 | + for key, data in self.table.scan(columns=cols): | ||
| 181 | + data = [data[k] for k in cols] | ||
| 182 | + self.dict_data[key] = data | ||
| 183 | + | ||
| 184 | + dict_data_ext = {} | ||
| 185 | + | ||
| 186 | + for imgname, imgdata in self.dict_data.items(): | ||
| 187 | + try: | ||
| 188 | + tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | ||
| 189 | + tmpf_src.write(imgdata[0]) | ||
| 190 | + tmpf_src.seek(0) | ||
| 191 | + tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | ||
| 192 | + | ||
| 193 | + if rate == None: | ||
| 194 | + embed_rate = f5.embed_raw_data(tmpf_src.name, os.path.join(package_dir, '../res/toembed'), tmpf_dst.name) | ||
| 195 | + else: | ||
| 196 | + assert (rate >= 0 and rate < 1) | ||
| 197 | + # print capacity | ||
| 198 | + hidden = np.random.bytes(int(int(imgdata[4]) * rate) / 8) | ||
| 199 | + embed_rate = f5.embed_raw_data(tmpf_src.name, hidden, tmpf_dst.name, frommem=True) | ||
| 200 | + | ||
| 201 | + tmpf_dst.seek(0) | ||
| 202 | + raw = tmpf_dst.read() | ||
| 203 | + index = md5(raw).hexdigest() | ||
| 204 | + dict_data_ext[index + '.jpg'] = [raw] + self._get_info(raw, embed_rate, 0, 1) | ||
| 205 | + | ||
| 206 | + | ||
| 207 | + except Exception as e: | ||
| 208 | + print e | ||
| 209 | + raise | ||
| 210 | + finally: | ||
| 211 | + tmpf_src.close() | ||
| 212 | + tmpf_dst.close() | ||
| 213 | + | ||
| 214 | + self.dict_data.update(dict_data_ext) | ||
| 215 | + | ||
| 216 | + if not writeback: | ||
| 217 | + return self.dict_data | ||
| 218 | + else: | ||
| 219 | + try: | ||
| 220 | + with self.table.batch(batch_size=5000) as b: | ||
| 221 | + for imgname, imginfo in dict_data_ext.items(): | ||
| 222 | + b.put(imgname, | ||
| 223 | + { | ||
| 224 | + 'cf_pic:data': imginfo[0], | ||
| 225 | + 'cf_info:width': imginfo[1], | ||
| 226 | + 'cf_info:height': imginfo[2], | ||
| 227 | + 'cf_info:size': imginfo[3], | ||
| 228 | + 'cf_info:capacity': imginfo[4], | ||
| 229 | + 'cf_info:quality': imginfo[5], | ||
| 230 | + 'cf_info:rate': imginfo[6], | ||
| 231 | + 'cf_tag:chosen': imginfo[7], | ||
| 232 | + 'cf_tag:class': imginfo[8], }) | ||
| 233 | + except ValueError: | ||
| 234 | + raise | ||
| 235 | + | ||
| 236 | + elif mode == 'spark': | ||
| 237 | + pass | ||
| 238 | + else: | ||
| 239 | + raise Exception("Unknown mode!") | ||
| 240 | + | ||
| 241 | + | ||
| 242 | + def _extract_feat(self, mode='hbase', feattype='ibd', readforward=False, writeback=False, **kwargs): | ||
| 243 | + if mode == 'hbase': | ||
| 244 | + if self.table == None: | ||
| 245 | + self.table = self.get_table() | ||
| 246 | + | ||
| 247 | + if readforward: | ||
| 248 | + self.dict_data = {} | ||
| 249 | + cols = ['cf_pic:data', | ||
| 250 | + 'cf_info:width', | ||
| 251 | + 'cf_info:height', | ||
| 252 | + 'cf_info:size', | ||
| 253 | + 'cf_info:capacity', | ||
| 254 | + 'cf_info:quality', | ||
| 255 | + 'cf_info:rate', | ||
| 256 | + 'cf_tag:chosen', | ||
| 257 | + 'cf_tag:class'] | ||
| 258 | + for key, data in self.table.scan(columns=cols): | ||
| 259 | + data = [data[k] for k in cols] | ||
| 260 | + self.dict_data[key] = data | ||
| 261 | + | ||
| 262 | + for imgname, imgdata in self.dict_data.items(): | ||
| 263 | + try: | ||
| 264 | + tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | ||
| 265 | + tmpf_src.write(imgdata[0]) | ||
| 266 | + tmpf_src.seek(0) | ||
| 267 | + | ||
| 268 | + desc = json.dumps(self._get_feat(tmpf_src.name, feattype=feattype).tolist()) | ||
| 269 | + | ||
| 270 | + self.dict_data[imgname].append(desc) | ||
| 271 | + | ||
| 272 | + except Exception as e: | ||
| 273 | + print e | ||
| 274 | + raise | ||
| 275 | + finally: | ||
| 276 | + tmpf_src.close() | ||
| 277 | + | ||
| 278 | + if not writeback: | ||
| 279 | + return self.dict_data | ||
| 280 | + else: | ||
| 281 | + try: | ||
| 282 | + with self.table.batch(batch_size=5000) as b: | ||
| 283 | + for imgname, imginfo in self.dict_data.items(): | ||
| 284 | + b.put(imgname, | ||
| 285 | + { | ||
| 286 | + 'cf_pic:data': imginfo[0], | ||
| 287 | + 'cf_info:width': imginfo[1], | ||
| 288 | + 'cf_info:height': imginfo[2], | ||
| 289 | + 'cf_info:size': imginfo[3], | ||
| 290 | + 'cf_info:capacity': imginfo[4], | ||
| 291 | + 'cf_info:quality': imginfo[5], | ||
| 292 | + 'cf_info:rate': imginfo[6], | ||
| 293 | + 'cf_tag:chosen': imginfo[7], | ||
| 294 | + 'cf_tag:class': imginfo[8], | ||
| 295 | + 'cf_feat:' + feattype: imginfo[9]}) | ||
| 296 | + except ValueError: | ||
| 297 | + raise | ||
| 298 | + | ||
| 299 | + elif mode == 'spark': | ||
| 300 | + pass | ||
| 301 | + else: | ||
| 302 | + raise Exception("Unknown mode!") | ||
| 303 | + | ||
| 304 | + | ||
| 305 | + def format(self): | ||
| 306 | + self._extract_data(mode='hbase', writeback=False) | ||
| 307 | + self._embed_data(mode='hbase', rate=0.1, readforward=False, writeback=False) | ||
| 308 | + self._extract_feat(mode='hbase', feattype='ibd', readforward=False, writeback=True) | ||
| 309 | + | ||
| 310 | + | ||
| 311 | + def load_data(self, mode='local', feattype='ibd', tagtype='class'): | ||
| 312 | + INDEX = [] | ||
| 313 | + X = [] | ||
| 314 | + Y = [] | ||
| 315 | + | ||
| 316 | + if mode == "local": | ||
| 317 | + | ||
| 318 | + dict_dataset = {} | ||
| 319 | + | ||
| 320 | + with open(self.list_file, 'rb') as tsvfile: | ||
| 321 | + tsvfile = csv.reader(tsvfile, delimiter='\t') | ||
| 322 | + for line in tsvfile: | ||
| 323 | + hash = line[0] | ||
| 324 | + tag = line[-1] | ||
| 325 | + path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype) | ||
| 326 | + if path_feat: | ||
| 327 | + with open(path_feat, 'rb') as featfile: | ||
| 328 | + dict_dataset[hash] = (tag, json.loads(featfile.read())) | ||
| 329 | + | ||
| 330 | + for tag, feat in dict_dataset.values(): | ||
| 331 | + X.append([item for sublist in feat for subsublist in sublist for item in subsublist]) | ||
| 332 | + Y.append(int(tag)) | ||
| 333 | + | ||
| 334 | + elif mode == "remote" or mode == "hbase": | ||
| 335 | + if self.table == None: | ||
| 336 | + self.table = self.get_table() | ||
| 337 | + | ||
| 338 | + col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype | ||
| 339 | + for key, data in self.table.scan(columns=[col_feat, col_tag]): | ||
| 340 | + X.append(json.loads(data[col_feat])) | ||
| 341 | + Y.append(1 if data[col_tag] == 'True' else 0) | ||
| 342 | + | ||
| 343 | + elif mode == "spark" or mode == "cluster": | ||
| 344 | + if self.sparkcontex == None: | ||
| 345 | + self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077') | ||
| 346 | + | ||
| 347 | + result = self.sparkcontex.read_hbase(self.table_name) # result = {key:[feat,tag],...} | ||
| 348 | + for feat, tag in result: | ||
| 349 | + X.append(feat) | ||
| 350 | + Y.append(tag) | ||
| 351 | + | ||
| 352 | + else: | ||
| 353 | + raise Exception("Unknown mode!") | ||
| 354 | + | ||
| 355 | + return X, Y | ||
| 356 | + | ||
| 357 | + | ||
| 358 | + | ||
| 359 | + | ||
| 360 | + | ||
| 361 | + | ||
| 362 | + | ||
| 363 | + | ||
| 364 | + | ||
| 365 | + | ||
| 366 | + | ||
| 367 | + |
test/test_data.py
| @@ -2,7 +2,7 @@ __author__ = 'chunk' | @@ -2,7 +2,7 @@ __author__ = 'chunk' | ||
| 2 | 2 | ||
| 3 | from ..common import * | 3 | from ..common import * |
| 4 | 4 | ||
| 5 | -from ..mdata import MSR, CV, ILSVRC | 5 | +from ..mdata import MSR, CV, ILSVRC, ILSVRC_S |
| 6 | 6 | ||
| 7 | 7 | ||
| 8 | def test_MSR(): | 8 | def test_MSR(): |
| @@ -31,30 +31,38 @@ def test_CV(): | @@ -31,30 +31,38 @@ def test_CV(): | ||
| 31 | def test_ILSVRC(): | 31 | def test_ILSVRC(): |
| 32 | timer = Timer() | 32 | timer = Timer() |
| 33 | # dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train') | 33 | # dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train') |
| 34 | - dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_1') | 34 | + dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') |
| 35 | # dil = ILSVRC.DataILSVRC(base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train_1') | 35 | # dil = ILSVRC.DataILSVRC(base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train_1') |
| 36 | 36 | ||
| 37 | dil.format() | 37 | dil.format() |
| 38 | - dil.embed(rate=0.1) | ||
| 39 | - dil.extract_feat(feattype='ibd') | 38 | + # dil.embed(rate=0.1) |
| 39 | + # dil.extract_feat(feattype='ibd') | ||
| 40 | # dil.extract_feat(feattype='hog') | 40 | # dil.extract_feat(feattype='hog') |
| 41 | 41 | ||
| 42 | timer.mark() | 42 | timer.mark() |
| 43 | dil.store_img() | 43 | dil.store_img() |
| 44 | timer.report() | 44 | timer.report() |
| 45 | - | ||
| 46 | - timer.mark() | ||
| 47 | - dil.store_tag() | ||
| 48 | - timer.report() | ||
| 49 | 45 | ||
| 50 | - timer.mark() | ||
| 51 | - dil.store_info() | ||
| 52 | - timer.report() | 46 | + # timer.mark() |
| 47 | + # dil.store_tag() | ||
| 48 | + # timer.report() | ||
| 49 | + # | ||
| 50 | + # timer.mark() | ||
| 51 | + # dil.store_info() | ||
| 52 | + # timer.report() | ||
| 53 | + # | ||
| 54 | + # timer.mark() | ||
| 55 | + # dil.store_feat() | ||
| 56 | + # timer.report() | ||
| 53 | 57 | ||
| 54 | - timer.mark() | ||
| 55 | - dil.store_feat() | ||
| 56 | - timer.report() | ||
| 57 | 58 | ||
| 59 | +def test_ILSVRC_S(): | ||
| 60 | + timer = Timer() | ||
| 61 | + dils = ILSVRC_S.DataILSVRC_S(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | ||
| 62 | + | ||
| 63 | + dils._extract_data(mode='hbase', writeback=True) | ||
| 64 | + dils._embed_data(mode='hbase', rate=0.1, readforward=True, writeback=True) | ||
| 65 | + dils._extract_feat( mode='hbase', feattype='ibd', readforward=True, writeback=True) | ||
| 58 | 66 | ||
| 59 | if __name__ == '__main__': | 67 | if __name__ == '__main__': |
| 60 | # test_MSR() | 68 | # test_MSR() |