Commit 24768a990fbda84a2df0b387178ce4039863d423

Authored by Chunk
1 parent f25fd27c
Exists in master and in 1 other branch refactor

mode 'hbase' finished. (Testing is sometimes interesting, especially when your code is well structured and has few bugs!)
mdata/ILSVRC-S.py
@@ -1,359 +0,0 @@ @@ -1,359 +0,0 @@
1 -__author__ = 'chunk'  
2 -  
3 -from . import *  
4 -from ..mfeat import HOG, IntraBlockDiff  
5 -from ..mspark import SC  
6 -from ..common import *  
7 -  
8 -import os, sys  
9 -from PIL import Image  
10 -from hashlib import md5  
11 -import csv  
12 -import shutil  
13 -import json  
14 -import collections  
15 -import happybase  
16 -  
17 -from ..mjpeg import *  
18 -from ..msteg import *  
19 -from ..msteg.steganography import LSB, F3, F4, F5  
20 -  
21 -import numpy as np  
22 -from numpy.random import randn  
23 -import pandas as pd  
24 -from scipy import stats  
25 -  
26 -from subprocess import Popen, PIPE, STDOUT  
27 -import tempfile  
28 -  
29 -np.random.seed(sum(map(ord, "whoami")))  
30 -  
31 -package_dir = os.path.dirname(os.path.abspath(__file__))  
32 -  
33 -  
34 -class DataILSVRCS(DataDumperBase):  
35 - """  
36 - This module is specially for ILSVRC data processing under spark & hbase.  
37 -  
38 - We posit that the DB(e.g. HBase) has only the images data with md5 name as id.  
39 - The task is to gennerate info(size,capacity,quality,etc.) and class & chosen tags, and then to perform embedding and finally to calcculate ibd features.  
40 -  
41 - Each step includes reading from & writing to Hbase (though PC).  
42 - And each step must have a 'spark' mode option, which means that the operation is performed by spark with reading & wrting through RDDs.  
43 -  
44 - chunkplus@gmail.com  
45 - """  
46 -  
47 - def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train'):  
48 - DataDumperBase.__init__(self, base_dir, category)  
49 -  
50 - self.base_dir = base_dir  
51 - self.category = category  
52 -  
53 - self.dict_data = {}  
54 -  
55 - self.table_name = self.base_dir.strip('/').split('/')[-1] + '-' + self.category  
56 - self.sparkcontex = None  
57 -  
58 -  
59 - def _get_table(self):  
60 - if self.table != None:  
61 - return self.table  
62 -  
63 - if self.connection is None:  
64 - c = happybase.Connection('HPC-server')  
65 - self.connection = c  
66 -  
67 - tables = self.connection.tables()  
68 - if self.table_name not in tables:  
69 - families = {'cf_pic': dict(),  
70 - 'cf_info': dict(max_versions=10),  
71 - 'cf_tag': dict(),  
72 - 'cf_feat': dict(),  
73 - }  
74 - self.connection.create_table(name=self.table_name, families=families)  
75 -  
76 - table = self.connection.table(name=self.table_name)  
77 -  
78 - self.table = table  
79 -  
80 - return table  
81 -  
82 - def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None):  
83 - """  
84 - Tempfile is our friend. (?)  
85 - """  
86 - info_rate = info_rate if info_rate != None else 0.0  
87 - tag_chosen = tag_chosen if tag_chosen != None else stats.bernoulli.rvs(0.8)  
88 - tag_class = tag_class if tag_class != None else 0  
89 - try:  
90 - tmpf = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')  
91 - tmpf.write(img)  
92 - # tmpf.seek(0)  
93 - im = Jpeg(tmpf.name, key=sample_key)  
94 - info = [im.image_width,  
95 - im.image_height,  
96 - im.image_width * im.image_height,  
97 - im.getCapacity(),  
98 - im.getQuality(),  
99 - info_rate,  
100 - tag_chosen,  
101 - tag_class]  
102 - return info  
103 - except Exception as e:  
104 - print e  
105 - finally:  
106 - tmpf.close()  
107 -  
108 - def _get_feat(self, image, feattype='ibd', **kwargs):  
109 - size = kwargs.get('size', (48, 48))  
110 -  
111 - if feattype == 'hog':  
112 - feater = HOG.FeatHOG(size=size)  
113 - elif feattype == 'ibd':  
114 - feater = IntraBlockDiff.FeatIntraBlockDiff()  
115 - else:  
116 - raise Exception("Unknown feature type!")  
117 -  
118 - desc = feater.feat(image)  
119 -  
120 - return desc  
121 -  
122 - def _extract_data(self, mode='hbase', writeback=False):  
123 - """  
124 - Get info barely out of image data.  
125 - """  
126 - if mode == 'hbase':  
127 - if self.table == None:  
128 - self.table = self.get_table()  
129 -  
130 - cols = ['cf_pic:data']  
131 - for key, data in self.table.scan(columns=cols, scan_batching=True):  
132 - self.dict_data[key] = [data] + self._get_info(data)  
133 -  
134 - if not writeback:  
135 - return self.dict_data  
136 - else:  
137 - try:  
138 - with self.table.batch(batch_size=5000) as b:  
139 - for imgname, imginfo in self.dict_data.items():  
140 - b.put(imgname,  
141 - {  
142 - # 'cf_pic:data': imginfo[0],  
143 - 'cf_info:width': imginfo[1],  
144 - 'cf_info:height': imginfo[2],  
145 - 'cf_info:size': imginfo[3],  
146 - 'cf_info:capacity': imginfo[4],  
147 - 'cf_info:quality': imginfo[5],  
148 - 'cf_info:rate': imginfo[6],  
149 - 'cf_tag:chosen': imginfo[7],  
150 - 'cf_tag:class': imginfo[8], })  
151 - except ValueError:  
152 - raise  
153 -  
154 -  
155 - elif mode == 'spark':  
156 - pass  
157 - else:  
158 - raise Exception("Unknown mode!")  
159 -  
160 -  
161 - def _embed_data(self, mode='hbase', rate=None, readforward=False, writeback=False):  
162 - f5 = F5.F5(sample_key, 1)  
163 - if mode == 'hbase':  
164 - if self.table == None:  
165 - self.table = self.get_table()  
166 -  
167 - if readforward:  
168 - self.dict_data = {}  
169 - cols = ['cf_pic:data',  
170 - 'cf_info:width',  
171 - 'cf_info:height',  
172 - 'cf_info:size',  
173 - 'cf_info:capacity',  
174 - 'cf_info:quality',  
175 - 'cf_info:rate',  
176 - 'cf_tag:chosen',  
177 - 'cf_tag:class']  
178 - for key, data in self.table.scan(columns=cols, scan_batching=True):  
179 - self.dict_data[key] = data  
180 -  
181 - dict_data_ext = {}  
182 -  
183 - for imgname, imgdata in self.dict_data.items():  
184 - try:  
185 - tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')  
186 - tmpf_src.write(imgdata[0])  
187 - tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')  
188 -  
189 - if rate == None:  
190 - embed_rate = f5.embed_raw_data(tmpf_src, os.path.join(package_dir, '../res/toembed'), tmpf_dst)  
191 - else:  
192 - assert (rate >= 0 and rate < 1)  
193 - # print capacity  
194 - hidden = np.random.bytes(int(imgdata[4] * rate) / 8)  
195 - embed_rate = f5.embed_raw_data(tmpf_src, hidden, tmpf_dst, frommem=True)  
196 -  
197 - tmpf_dst.seek(0)  
198 - raw = tmpf_dst.read()  
199 - index = md5(raw).hexdigest()  
200 - dict_data_ext[index + '.jpg'] = [raw] + self._get_info(raw, embed_rate, 0, 1)  
201 -  
202 -  
203 - except Exception as e:  
204 - print e  
205 - finally:  
206 - tmpf_src.close()  
207 - tmpf_dst.close()  
208 -  
209 - self.dict_data.update(dict_data_ext)  
210 -  
211 - if not writeback:  
212 - return self.dict_data  
213 - else:  
214 - try:  
215 - with self.table.batch(batch_size=5000) as b:  
216 - for imgname, imginfo in dict_data_ext.items():  
217 - b.put(imgname,  
218 - {  
219 - 'cf_pic:data': imginfo[0],  
220 - 'cf_info:width': imginfo[1],  
221 - 'cf_info:height': imginfo[2],  
222 - 'cf_info:size': imginfo[3],  
223 - 'cf_info:capacity': imginfo[4],  
224 - 'cf_info:quality': imginfo[5],  
225 - 'cf_info:rate': imginfo[6],  
226 - 'cf_tag:chosen': imginfo[7],  
227 - 'cf_tag:class': imginfo[8], })  
228 - except ValueError:  
229 - raise  
230 -  
231 - elif mode == 'spark':  
232 - pass  
233 - else:  
234 - raise Exception("Unknown mode!")  
235 -  
236 -  
237 - def _extract_feat(self, mode='hbase', feattype='ibd', readforward=False, writeback=False, **kwargs):  
238 - if mode == 'hbase':  
239 - if self.table == None:  
240 - self.table = self.get_table()  
241 -  
242 - if readforward:  
243 - self.dict_data = {}  
244 - cols = ['cf_pic:data',  
245 - 'cf_info:width',  
246 - 'cf_info:height',  
247 - 'cf_info:size',  
248 - 'cf_info:capacity',  
249 - 'cf_info:quality',  
250 - 'cf_info:rate',  
251 - 'cf_tag:chosen',  
252 - 'cf_tag:class']  
253 - for key, data in self.table.scan(columns=cols, scan_batching=True):  
254 - self.dict_data[key] = data  
255 -  
256 - for imgname, imgdata in self.dict_data.items():  
257 - try:  
258 - tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')  
259 - tmpf_src.write(imgdata[0])  
260 -  
261 - desc = json.dumps(self._get_feat(tmpf_src, feattype=feattype))  
262 -  
263 - self.dict_data[imgname].append(desc)  
264 -  
265 - except Exception as e:  
266 - print e  
267 - finally:  
268 - tmpf_src.close()  
269 -  
270 - if not writeback:  
271 - return self.dict_data  
272 - else:  
273 - try:  
274 - with self.table.batch(batch_size=5000) as b:  
275 - for imgname, imginfo in self.dict_data.items():  
276 - b.put(imgname,  
277 - {  
278 - 'cf_pic:data': imginfo[0],  
279 - 'cf_info:width': imginfo[1],  
280 - 'cf_info:height': imginfo[2],  
281 - 'cf_info:size': imginfo[3],  
282 - 'cf_info:capacity': imginfo[4],  
283 - 'cf_info:quality': imginfo[5],  
284 - 'cf_info:rate': imginfo[6],  
285 - 'cf_tag:chosen': imginfo[7],  
286 - 'cf_tag:class': imginfo[8],  
287 - 'cf_feat:' + feattype: imginfo[9]})  
288 - except ValueError:  
289 - raise  
290 -  
291 - elif mode == 'spark':  
292 - pass  
293 - else:  
294 - raise Exception("Unknown mode!")  
295 -  
296 -  
297 - def format(self):  
298 - self._extract_data(mode='hbase', writeback=False)  
299 - self._embed_data(mode='hbase', rate=0.1, readforward=False, writeback=False)  
300 - self._extract_feat(mode='hbase', feattype='ibd', readforward=False, writeback=True)  
301 -  
302 -  
303 - def load_data(self, mode='local', feattype='ibd', tagtype='class'):  
304 - INDEX = []  
305 - X = []  
306 - Y = []  
307 -  
308 - if mode == "local":  
309 -  
310 - dict_dataset = {}  
311 -  
312 - with open(self.list_file, 'rb') as tsvfile:  
313 - tsvfile = csv.reader(tsvfile, delimiter='\t')  
314 - for line in tsvfile:  
315 - hash = line[0]  
316 - tag = line[-1]  
317 - path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype)  
318 - if path_feat:  
319 - with open(path_feat, 'rb') as featfile:  
320 - dict_dataset[hash] = (tag, json.loads(featfile.read()))  
321 -  
322 - for tag, feat in dict_dataset.values():  
323 - X.append([item for sublist in feat for subsublist in sublist for item in subsublist])  
324 - Y.append(int(tag))  
325 -  
326 - elif mode == "remote" or mode == "hbase":  
327 - if self.table == None:  
328 - self.table = self.get_table()  
329 -  
330 - col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype  
331 - for key, data in self.table.scan(columns=[col_feat, col_tag]):  
332 - X.append(json.loads(data[col_feat]))  
333 - Y.append(1 if data[col_tag] == 'True' else 0)  
334 -  
335 - elif mode == "spark" or mode == "cluster":  
336 - if self.sparkcontex == None:  
337 - self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077')  
338 -  
339 - result = self.sparkcontex.read_hbase(self.table_name) # result = {key:[feat,tag],...}  
340 - for feat, tag in result:  
341 - X.append(feat)  
342 - Y.append(tag)  
343 -  
344 - else:  
345 - raise Exception("Unknown mode!")  
346 -  
347 - return X, Y  
348 -  
349 -  
350 -  
351 -  
352 -  
353 -  
354 -  
355 -  
356 -  
357 -  
358 -  
359 -  
mdata/ILSVRC.py
@@ -302,7 +302,7 @@ class DataILSVRC(DataDumperBase): @@ -302,7 +302,7 @@ class DataILSVRC(DataDumperBase):
302 tsvfile = csv.reader(tsvfile, delimiter='\t') 302 tsvfile = csv.reader(tsvfile, delimiter='\t')
303 for line in tsvfile: 303 for line in tsvfile:
304 path_img = os.path.join(self.img_dir, line[0][:3], line[0][3:] + '.jpg') 304 path_img = os.path.join(self.img_dir, line[0][:3], line[0][3:] + '.jpg')
305 - if path_img: 305 + if path_img:
306 with open(path_img, 'rb') as fpic: 306 with open(path_img, 'rb') as fpic:
307 dict_databuf[line[0] + '.jpg'] = fpic.read() 307 dict_databuf[line[0] + '.jpg'] = fpic.read()
308 308
mdata/ILSVRC_S.py 0 → 100644
@@ -0,0 +1,367 @@ @@ -0,0 +1,367 @@
  1 +__author__ = 'chunk'
  2 +
  3 +from . import *
  4 +from ..mfeat import HOG, IntraBlockDiff
  5 +from ..mspark import SC
  6 +from ..common import *
  7 +
  8 +import os, sys
  9 +from PIL import Image
  10 +from hashlib import md5
  11 +import csv
  12 +import shutil
  13 +import json
  14 +import collections
  15 +import happybase
  16 +
  17 +from ..mjpeg import *
  18 +from ..msteg import *
  19 +from ..msteg.steganography import LSB, F3, F4, F5
  20 +
  21 +import numpy as np
  22 +from numpy.random import randn
  23 +import pandas as pd
  24 +from scipy import stats
  25 +
  26 +from subprocess import Popen, PIPE, STDOUT
  27 +import tempfile
  28 +
# Seed NumPy's RNG from a fixed string so every run is reproducible.
np.random.seed(sum(map(ord, "whoami")))

# Absolute directory of this module; used to locate bundled resources (../res).
package_dir = os.path.dirname(os.path.abspath(__file__))
  32 +
  33 +
class DataILSVRC_S(DataDumperBase):
    """
    ILSVRC data processing under Spark & HBase.

    We posit that the DB (e.g. HBase) already holds only the raw image data,
    keyed by the md5 of each image. The pipeline then:

      1. generates per-image info (size, capacity, quality, etc.) plus the
         class & chosen tags,
      2. performs steganographic embedding, and
      3. calculates the ibd features.

    Each step reads from & writes back to HBase (through the PC), and each
    step must offer a 'spark' mode option, meaning the operation is performed
    by Spark with reading & writing done through RDDs.

    chunkplus@gmail.com
    """

    def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train'):
        DataDumperBase.__init__(self, base_dir, category)

        self.base_dir = base_dir
        self.category = category

        # In-memory cache of rows: {row_key: [raw_jpeg, width, height, ...]}.
        self.dict_data = {}

        # HBase table name, e.g. 'ILSVRC2013_DET_val-Train'.
        self.table_name = self.base_dir.strip('/').split('/')[-1] + '-' + self.category
        # NOTE(review): attribute keeps the original (misspelled) name
        # 'sparkcontex' because load_data() reads it by this name.
        self.sparkcontex = None
  57 +
  58 +
  59 + def get_table(self):
  60 + if self.table != None:
  61 + return self.table
  62 +
  63 + if self.connection is None:
  64 + c = happybase.Connection('HPC-server')
  65 + self.connection = c
  66 +
  67 + tables = self.connection.tables()
  68 + if self.table_name not in tables:
  69 + families = {'cf_pic': dict(),
  70 + 'cf_info': dict(max_versions=10),
  71 + 'cf_tag': dict(),
  72 + 'cf_feat': dict(),
  73 + }
  74 + self.connection.create_table(name=self.table_name, families=families)
  75 +
  76 + table = self.connection.table(name=self.table_name)
  77 +
  78 + self.table = table
  79 +
  80 + return table
  81 +
  82 + def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None):
  83 + """
  84 + Tempfile is our friend. (?)
  85 + """
  86 + info_rate = info_rate if info_rate != None else 0.0
  87 + tag_chosen = tag_chosen if tag_chosen != None else stats.bernoulli.rvs(0.8)
  88 + tag_class = tag_class if tag_class != None else 0
  89 + try:
  90 + tmpf = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
  91 + tmpf.write(img)
  92 + tmpf.seek(0)
  93 + im = Jpeg(tmpf.name, key=sample_key)
  94 + info = [str(im.image_width),
  95 + str(im.image_height),
  96 + str(im.image_width * im.image_height),
  97 + str(im.getCapacity()),
  98 + str(im.getQuality()),
  99 + str(info_rate),
  100 + str(tag_chosen),
  101 + str(tag_class)]
  102 + return info
  103 + except Exception as e:
  104 + print e
  105 + finally:
  106 + tmpf.close()
  107 +
  108 + def _get_feat(self, image, feattype='ibd', **kwargs):
  109 + size = kwargs.get('size', (48, 48))
  110 +
  111 + if feattype == 'hog':
  112 + feater = HOG.FeatHOG(size=size)
  113 + elif feattype == 'ibd':
  114 + feater = IntraBlockDiff.FeatIntraBlockDiff()
  115 + else:
  116 + raise Exception("Unknown feature type!")
  117 +
  118 + desc = feater.feat(image)
  119 +
  120 + return desc
  121 +
  122 + def _extract_data(self, mode='hbase', writeback=False):
  123 + """
  124 + Get info barely out of image data.
  125 + """
  126 + if mode == 'hbase':
  127 + if self.table == None:
  128 + self.table = self.get_table()
  129 +
  130 + cols = ['cf_pic:data']
  131 + for key, data in self.table.scan(columns=cols):
  132 + data = data['cf_pic:data']
  133 + self.dict_data[key] = [data] + self._get_info(data)
  134 +
  135 + if not writeback:
  136 + return self.dict_data
  137 + else:
  138 + try:
  139 + with self.table.batch(batch_size=5000) as b:
  140 + for imgname, imginfo in self.dict_data.items():
  141 + b.put(imgname,
  142 + {
  143 + # 'cf_pic:data': imginfo[0],
  144 + 'cf_info:width': imginfo[1],
  145 + 'cf_info:height': imginfo[2],
  146 + 'cf_info:size': imginfo[3],
  147 + 'cf_info:capacity': imginfo[4],
  148 + 'cf_info:quality': imginfo[5],
  149 + 'cf_info:rate': imginfo[6],
  150 + 'cf_tag:chosen': imginfo[7],
  151 + 'cf_tag:class': imginfo[8],
  152 + })
  153 + except ValueError:
  154 + raise
  155 +
  156 +
  157 + elif mode == 'spark':
  158 + pass
  159 + else:
  160 + raise Exception("Unknown mode!")
  161 +
  162 +
  163 + def _embed_data(self, mode='hbase', rate=None, readforward=False, writeback=False):
  164 + f5 = F5.F5(sample_key, 1)
  165 + if mode == 'hbase':
  166 + if self.table == None:
  167 + self.table = self.get_table()
  168 +
  169 + if readforward:
  170 + self.dict_data = {}
  171 + cols = ['cf_pic:data',
  172 + 'cf_info:width',
  173 + 'cf_info:height',
  174 + 'cf_info:size',
  175 + 'cf_info:capacity',
  176 + 'cf_info:quality',
  177 + 'cf_info:rate',
  178 + 'cf_tag:chosen',
  179 + 'cf_tag:class']
  180 + for key, data in self.table.scan(columns=cols):
  181 + data = [data[k] for k in cols]
  182 + self.dict_data[key] = data
  183 +
  184 + dict_data_ext = {}
  185 +
  186 + for imgname, imgdata in self.dict_data.items():
  187 + try:
  188 + tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
  189 + tmpf_src.write(imgdata[0])
  190 + tmpf_src.seek(0)
  191 + tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
  192 +
  193 + if rate == None:
  194 + embed_rate = f5.embed_raw_data(tmpf_src.name, os.path.join(package_dir, '../res/toembed'), tmpf_dst.name)
  195 + else:
  196 + assert (rate >= 0 and rate < 1)
  197 + # print capacity
  198 + hidden = np.random.bytes(int(int(imgdata[4]) * rate) / 8)
  199 + embed_rate = f5.embed_raw_data(tmpf_src.name, hidden, tmpf_dst.name, frommem=True)
  200 +
  201 + tmpf_dst.seek(0)
  202 + raw = tmpf_dst.read()
  203 + index = md5(raw).hexdigest()
  204 + dict_data_ext[index + '.jpg'] = [raw] + self._get_info(raw, embed_rate, 0, 1)
  205 +
  206 +
  207 + except Exception as e:
  208 + print e
  209 + raise
  210 + finally:
  211 + tmpf_src.close()
  212 + tmpf_dst.close()
  213 +
  214 + self.dict_data.update(dict_data_ext)
  215 +
  216 + if not writeback:
  217 + return self.dict_data
  218 + else:
  219 + try:
  220 + with self.table.batch(batch_size=5000) as b:
  221 + for imgname, imginfo in dict_data_ext.items():
  222 + b.put(imgname,
  223 + {
  224 + 'cf_pic:data': imginfo[0],
  225 + 'cf_info:width': imginfo[1],
  226 + 'cf_info:height': imginfo[2],
  227 + 'cf_info:size': imginfo[3],
  228 + 'cf_info:capacity': imginfo[4],
  229 + 'cf_info:quality': imginfo[5],
  230 + 'cf_info:rate': imginfo[6],
  231 + 'cf_tag:chosen': imginfo[7],
  232 + 'cf_tag:class': imginfo[8], })
  233 + except ValueError:
  234 + raise
  235 +
  236 + elif mode == 'spark':
  237 + pass
  238 + else:
  239 + raise Exception("Unknown mode!")
  240 +
  241 +
  242 + def _extract_feat(self, mode='hbase', feattype='ibd', readforward=False, writeback=False, **kwargs):
  243 + if mode == 'hbase':
  244 + if self.table == None:
  245 + self.table = self.get_table()
  246 +
  247 + if readforward:
  248 + self.dict_data = {}
  249 + cols = ['cf_pic:data',
  250 + 'cf_info:width',
  251 + 'cf_info:height',
  252 + 'cf_info:size',
  253 + 'cf_info:capacity',
  254 + 'cf_info:quality',
  255 + 'cf_info:rate',
  256 + 'cf_tag:chosen',
  257 + 'cf_tag:class']
  258 + for key, data in self.table.scan(columns=cols):
  259 + data = [data[k] for k in cols]
  260 + self.dict_data[key] = data
  261 +
  262 + for imgname, imgdata in self.dict_data.items():
  263 + try:
  264 + tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
  265 + tmpf_src.write(imgdata[0])
  266 + tmpf_src.seek(0)
  267 +
  268 + desc = json.dumps(self._get_feat(tmpf_src.name, feattype=feattype).tolist())
  269 +
  270 + self.dict_data[imgname].append(desc)
  271 +
  272 + except Exception as e:
  273 + print e
  274 + raise
  275 + finally:
  276 + tmpf_src.close()
  277 +
  278 + if not writeback:
  279 + return self.dict_data
  280 + else:
  281 + try:
  282 + with self.table.batch(batch_size=5000) as b:
  283 + for imgname, imginfo in self.dict_data.items():
  284 + b.put(imgname,
  285 + {
  286 + 'cf_pic:data': imginfo[0],
  287 + 'cf_info:width': imginfo[1],
  288 + 'cf_info:height': imginfo[2],
  289 + 'cf_info:size': imginfo[3],
  290 + 'cf_info:capacity': imginfo[4],
  291 + 'cf_info:quality': imginfo[5],
  292 + 'cf_info:rate': imginfo[6],
  293 + 'cf_tag:chosen': imginfo[7],
  294 + 'cf_tag:class': imginfo[8],
  295 + 'cf_feat:' + feattype: imginfo[9]})
  296 + except ValueError:
  297 + raise
  298 +
  299 + elif mode == 'spark':
  300 + pass
  301 + else:
  302 + raise Exception("Unknown mode!")
  303 +
  304 +
  305 + def format(self):
  306 + self._extract_data(mode='hbase', writeback=False)
  307 + self._embed_data(mode='hbase', rate=0.1, readforward=False, writeback=False)
  308 + self._extract_feat(mode='hbase', feattype='ibd', readforward=False, writeback=True)
  309 +
  310 +
  311 + def load_data(self, mode='local', feattype='ibd', tagtype='class'):
  312 + INDEX = []
  313 + X = []
  314 + Y = []
  315 +
  316 + if mode == "local":
  317 +
  318 + dict_dataset = {}
  319 +
  320 + with open(self.list_file, 'rb') as tsvfile:
  321 + tsvfile = csv.reader(tsvfile, delimiter='\t')
  322 + for line in tsvfile:
  323 + hash = line[0]
  324 + tag = line[-1]
  325 + path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype)
  326 + if path_feat:
  327 + with open(path_feat, 'rb') as featfile:
  328 + dict_dataset[hash] = (tag, json.loads(featfile.read()))
  329 +
  330 + for tag, feat in dict_dataset.values():
  331 + X.append([item for sublist in feat for subsublist in sublist for item in subsublist])
  332 + Y.append(int(tag))
  333 +
  334 + elif mode == "remote" or mode == "hbase":
  335 + if self.table == None:
  336 + self.table = self.get_table()
  337 +
  338 + col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype
  339 + for key, data in self.table.scan(columns=[col_feat, col_tag]):
  340 + X.append(json.loads(data[col_feat]))
  341 + Y.append(1 if data[col_tag] == 'True' else 0)
  342 +
  343 + elif mode == "spark" or mode == "cluster":
  344 + if self.sparkcontex == None:
  345 + self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077')
  346 +
  347 + result = self.sparkcontex.read_hbase(self.table_name) # result = {key:[feat,tag],...}
  348 + for feat, tag in result:
  349 + X.append(feat)
  350 + Y.append(tag)
  351 +
  352 + else:
  353 + raise Exception("Unknown mode!")
  354 +
  355 + return X, Y
  356 +
  357 +
  358 +
  359 +
  360 +
  361 +
  362 +
  363 +
  364 +
  365 +
  366 +
  367 +
test/test_data.py
@@ -2,7 +2,7 @@ __author__ = &#39;chunk&#39; @@ -2,7 +2,7 @@ __author__ = &#39;chunk&#39;
2 2
3 from ..common import * 3 from ..common import *
4 4
5 -from ..mdata import MSR, CV, ILSVRC 5 +from ..mdata import MSR, CV, ILSVRC, ILSVRC_S
6 6
7 7
8 def test_MSR(): 8 def test_MSR():
@@ -31,30 +31,38 @@ def test_CV(): @@ -31,30 +31,38 @@ def test_CV():
31 def test_ILSVRC(): 31 def test_ILSVRC():
32 timer = Timer() 32 timer = Timer()
33 # dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train') 33 # dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train')
34 - dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_1') 34 + dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2')
35 # dil = ILSVRC.DataILSVRC(base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train_1') 35 # dil = ILSVRC.DataILSVRC(base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train_1')
36 36
37 dil.format() 37 dil.format()
38 - dil.embed(rate=0.1)  
39 - dil.extract_feat(feattype='ibd') 38 + # dil.embed(rate=0.1)
  39 + # dil.extract_feat(feattype='ibd')
40 # dil.extract_feat(feattype='hog') 40 # dil.extract_feat(feattype='hog')
41 41
42 timer.mark() 42 timer.mark()
43 dil.store_img() 43 dil.store_img()
44 timer.report() 44 timer.report()
45 -  
46 - timer.mark()  
47 - dil.store_tag()  
48 - timer.report()  
49 45
50 - timer.mark()  
51 - dil.store_info()  
52 - timer.report() 46 + # timer.mark()
  47 + # dil.store_tag()
  48 + # timer.report()
  49 + #
  50 + # timer.mark()
  51 + # dil.store_info()
  52 + # timer.report()
  53 + #
  54 + # timer.mark()
  55 + # dil.store_feat()
  56 + # timer.report()
53 57
54 - timer.mark()  
55 - dil.store_feat()  
56 - timer.report()  
57 58
  59 +def test_ILSVRC_S():
  60 + timer = Timer()
  61 + dils = ILSVRC_S.DataILSVRC_S(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2')
  62 +
  63 + dils._extract_data(mode='hbase', writeback=True)
  64 + dils._embed_data(mode='hbase', rate=0.1, readforward=True, writeback=True)
  65 + dils._extract_feat( mode='hbase', feattype='ibd', readforward=True, writeback=True)
58 66
59 if __name__ == '__main__': 67 if __name__ == '__main__':
60 # test_MSR() 68 # test_MSR()