Commit 24768a990fbda84a2df0b387178ce4039863d423
1 parent
f25fd27c
Exists in
master
and in
1 other branch
mode 'hbase' finished. (testing is sometimes interesting, especially when your code is well structured and with few bugs!)
Showing
4 changed files
with
390 additions
and
374 deletions
Show diff stats
mdata/ILSVRC-S.py
@@ -1,359 +0,0 @@ | @@ -1,359 +0,0 @@ | ||
1 | -__author__ = 'chunk' | ||
2 | - | ||
3 | -from . import * | ||
4 | -from ..mfeat import HOG, IntraBlockDiff | ||
5 | -from ..mspark import SC | ||
6 | -from ..common import * | ||
7 | - | ||
8 | -import os, sys | ||
9 | -from PIL import Image | ||
10 | -from hashlib import md5 | ||
11 | -import csv | ||
12 | -import shutil | ||
13 | -import json | ||
14 | -import collections | ||
15 | -import happybase | ||
16 | - | ||
17 | -from ..mjpeg import * | ||
18 | -from ..msteg import * | ||
19 | -from ..msteg.steganography import LSB, F3, F4, F5 | ||
20 | - | ||
21 | -import numpy as np | ||
22 | -from numpy.random import randn | ||
23 | -import pandas as pd | ||
24 | -from scipy import stats | ||
25 | - | ||
26 | -from subprocess import Popen, PIPE, STDOUT | ||
27 | -import tempfile | ||
28 | - | ||
29 | -np.random.seed(sum(map(ord, "whoami"))) | ||
30 | - | ||
31 | -package_dir = os.path.dirname(os.path.abspath(__file__)) | ||
32 | - | ||
33 | - | ||
34 | -class DataILSVRCS(DataDumperBase): | ||
35 | - """ | ||
36 | - This module is specially for ILSVRC data processing under spark & hbase. | ||
37 | - | ||
38 | - We posit that the DB(e.g. HBase) has only the images data with md5 name as id. | ||
39 | - The task is to generate info(size,capacity,quality,etc.) and class & chosen tags, and then to perform embedding and finally to calculate ibd features. | ||
40 | - | ||
41 | - Each step includes reading from & writing to Hbase (through PC). | ||
42 | - And each step must have a 'spark' mode option, which means that the operation is performed by spark with reading & writing through RDDs. | ||
43 | - | ||
44 | - chunkplus@gmail.com | ||
45 | - """ | ||
46 | - | ||
47 | - def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train'): | ||
48 | - DataDumperBase.__init__(self, base_dir, category) | ||
49 | - | ||
50 | - self.base_dir = base_dir | ||
51 | - self.category = category | ||
52 | - | ||
53 | - self.dict_data = {} | ||
54 | - | ||
55 | - self.table_name = self.base_dir.strip('/').split('/')[-1] + '-' + self.category | ||
56 | - self.sparkcontex = None | ||
57 | - | ||
58 | - | ||
59 | - def _get_table(self): | ||
60 | - if self.table != None: | ||
61 | - return self.table | ||
62 | - | ||
63 | - if self.connection is None: | ||
64 | - c = happybase.Connection('HPC-server') | ||
65 | - self.connection = c | ||
66 | - | ||
67 | - tables = self.connection.tables() | ||
68 | - if self.table_name not in tables: | ||
69 | - families = {'cf_pic': dict(), | ||
70 | - 'cf_info': dict(max_versions=10), | ||
71 | - 'cf_tag': dict(), | ||
72 | - 'cf_feat': dict(), | ||
73 | - } | ||
74 | - self.connection.create_table(name=self.table_name, families=families) | ||
75 | - | ||
76 | - table = self.connection.table(name=self.table_name) | ||
77 | - | ||
78 | - self.table = table | ||
79 | - | ||
80 | - return table | ||
81 | - | ||
82 | - def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None): | ||
83 | - """ | ||
84 | - Tempfile is our friend. (?) | ||
85 | - """ | ||
86 | - info_rate = info_rate if info_rate != None else 0.0 | ||
87 | - tag_chosen = tag_chosen if tag_chosen != None else stats.bernoulli.rvs(0.8) | ||
88 | - tag_class = tag_class if tag_class != None else 0 | ||
89 | - try: | ||
90 | - tmpf = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | ||
91 | - tmpf.write(img) | ||
92 | - # tmpf.seek(0) | ||
93 | - im = Jpeg(tmpf.name, key=sample_key) | ||
94 | - info = [im.image_width, | ||
95 | - im.image_height, | ||
96 | - im.image_width * im.image_height, | ||
97 | - im.getCapacity(), | ||
98 | - im.getQuality(), | ||
99 | - info_rate, | ||
100 | - tag_chosen, | ||
101 | - tag_class] | ||
102 | - return info | ||
103 | - except Exception as e: | ||
104 | - print e | ||
105 | - finally: | ||
106 | - tmpf.close() | ||
107 | - | ||
108 | - def _get_feat(self, image, feattype='ibd', **kwargs): | ||
109 | - size = kwargs.get('size', (48, 48)) | ||
110 | - | ||
111 | - if feattype == 'hog': | ||
112 | - feater = HOG.FeatHOG(size=size) | ||
113 | - elif feattype == 'ibd': | ||
114 | - feater = IntraBlockDiff.FeatIntraBlockDiff() | ||
115 | - else: | ||
116 | - raise Exception("Unknown feature type!") | ||
117 | - | ||
118 | - desc = feater.feat(image) | ||
119 | - | ||
120 | - return desc | ||
121 | - | ||
122 | - def _extract_data(self, mode='hbase', writeback=False): | ||
123 | - """ | ||
124 | - Get info barely out of image data. | ||
125 | - """ | ||
126 | - if mode == 'hbase': | ||
127 | - if self.table == None: | ||
128 | - self.table = self.get_table() | ||
129 | - | ||
130 | - cols = ['cf_pic:data'] | ||
131 | - for key, data in self.table.scan(columns=cols, scan_batching=True): | ||
132 | - self.dict_data[key] = [data] + self._get_info(data) | ||
133 | - | ||
134 | - if not writeback: | ||
135 | - return self.dict_data | ||
136 | - else: | ||
137 | - try: | ||
138 | - with self.table.batch(batch_size=5000) as b: | ||
139 | - for imgname, imginfo in self.dict_data.items(): | ||
140 | - b.put(imgname, | ||
141 | - { | ||
142 | - # 'cf_pic:data': imginfo[0], | ||
143 | - 'cf_info:width': imginfo[1], | ||
144 | - 'cf_info:height': imginfo[2], | ||
145 | - 'cf_info:size': imginfo[3], | ||
146 | - 'cf_info:capacity': imginfo[4], | ||
147 | - 'cf_info:quality': imginfo[5], | ||
148 | - 'cf_info:rate': imginfo[6], | ||
149 | - 'cf_tag:chosen': imginfo[7], | ||
150 | - 'cf_tag:class': imginfo[8], }) | ||
151 | - except ValueError: | ||
152 | - raise | ||
153 | - | ||
154 | - | ||
155 | - elif mode == 'spark': | ||
156 | - pass | ||
157 | - else: | ||
158 | - raise Exception("Unknown mode!") | ||
159 | - | ||
160 | - | ||
161 | - def _embed_data(self, mode='hbase', rate=None, readforward=False, writeback=False): | ||
162 | - f5 = F5.F5(sample_key, 1) | ||
163 | - if mode == 'hbase': | ||
164 | - if self.table == None: | ||
165 | - self.table = self.get_table() | ||
166 | - | ||
167 | - if readforward: | ||
168 | - self.dict_data = {} | ||
169 | - cols = ['cf_pic:data', | ||
170 | - 'cf_info:width', | ||
171 | - 'cf_info:height', | ||
172 | - 'cf_info:size', | ||
173 | - 'cf_info:capacity', | ||
174 | - 'cf_info:quality', | ||
175 | - 'cf_info:rate', | ||
176 | - 'cf_tag:chosen', | ||
177 | - 'cf_tag:class'] | ||
178 | - for key, data in self.table.scan(columns=cols, scan_batching=True): | ||
179 | - self.dict_data[key] = data | ||
180 | - | ||
181 | - dict_data_ext = {} | ||
182 | - | ||
183 | - for imgname, imgdata in self.dict_data.items(): | ||
184 | - try: | ||
185 | - tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | ||
186 | - tmpf_src.write(imgdata[0]) | ||
187 | - tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | ||
188 | - | ||
189 | - if rate == None: | ||
190 | - embed_rate = f5.embed_raw_data(tmpf_src, os.path.join(package_dir, '../res/toembed'), tmpf_dst) | ||
191 | - else: | ||
192 | - assert (rate >= 0 and rate < 1) | ||
193 | - # print capacity | ||
194 | - hidden = np.random.bytes(int(imgdata[4] * rate) / 8) | ||
195 | - embed_rate = f5.embed_raw_data(tmpf_src, hidden, tmpf_dst, frommem=True) | ||
196 | - | ||
197 | - tmpf_dst.seek(0) | ||
198 | - raw = tmpf_dst.read() | ||
199 | - index = md5(raw).hexdigest() | ||
200 | - dict_data_ext[index + '.jpg'] = [raw] + self._get_info(raw, embed_rate, 0, 1) | ||
201 | - | ||
202 | - | ||
203 | - except Exception as e: | ||
204 | - print e | ||
205 | - finally: | ||
206 | - tmpf_src.close() | ||
207 | - tmpf_dst.close() | ||
208 | - | ||
209 | - self.dict_data.update(dict_data_ext) | ||
210 | - | ||
211 | - if not writeback: | ||
212 | - return self.dict_data | ||
213 | - else: | ||
214 | - try: | ||
215 | - with self.table.batch(batch_size=5000) as b: | ||
216 | - for imgname, imginfo in dict_data_ext.items(): | ||
217 | - b.put(imgname, | ||
218 | - { | ||
219 | - 'cf_pic:data': imginfo[0], | ||
220 | - 'cf_info:width': imginfo[1], | ||
221 | - 'cf_info:height': imginfo[2], | ||
222 | - 'cf_info:size': imginfo[3], | ||
223 | - 'cf_info:capacity': imginfo[4], | ||
224 | - 'cf_info:quality': imginfo[5], | ||
225 | - 'cf_info:rate': imginfo[6], | ||
226 | - 'cf_tag:chosen': imginfo[7], | ||
227 | - 'cf_tag:class': imginfo[8], }) | ||
228 | - except ValueError: | ||
229 | - raise | ||
230 | - | ||
231 | - elif mode == 'spark': | ||
232 | - pass | ||
233 | - else: | ||
234 | - raise Exception("Unknown mode!") | ||
235 | - | ||
236 | - | ||
237 | - def _extract_feat(self, mode='hbase', feattype='ibd', readforward=False, writeback=False, **kwargs): | ||
238 | - if mode == 'hbase': | ||
239 | - if self.table == None: | ||
240 | - self.table = self.get_table() | ||
241 | - | ||
242 | - if readforward: | ||
243 | - self.dict_data = {} | ||
244 | - cols = ['cf_pic:data', | ||
245 | - 'cf_info:width', | ||
246 | - 'cf_info:height', | ||
247 | - 'cf_info:size', | ||
248 | - 'cf_info:capacity', | ||
249 | - 'cf_info:quality', | ||
250 | - 'cf_info:rate', | ||
251 | - 'cf_tag:chosen', | ||
252 | - 'cf_tag:class'] | ||
253 | - for key, data in self.table.scan(columns=cols, scan_batching=True): | ||
254 | - self.dict_data[key] = data | ||
255 | - | ||
256 | - for imgname, imgdata in self.dict_data.items(): | ||
257 | - try: | ||
258 | - tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | ||
259 | - tmpf_src.write(imgdata[0]) | ||
260 | - | ||
261 | - desc = json.dumps(self._get_feat(tmpf_src, feattype=feattype)) | ||
262 | - | ||
263 | - self.dict_data[imgname].append(desc) | ||
264 | - | ||
265 | - except Exception as e: | ||
266 | - print e | ||
267 | - finally: | ||
268 | - tmpf_src.close() | ||
269 | - | ||
270 | - if not writeback: | ||
271 | - return self.dict_data | ||
272 | - else: | ||
273 | - try: | ||
274 | - with self.table.batch(batch_size=5000) as b: | ||
275 | - for imgname, imginfo in self.dict_data.items(): | ||
276 | - b.put(imgname, | ||
277 | - { | ||
278 | - 'cf_pic:data': imginfo[0], | ||
279 | - 'cf_info:width': imginfo[1], | ||
280 | - 'cf_info:height': imginfo[2], | ||
281 | - 'cf_info:size': imginfo[3], | ||
282 | - 'cf_info:capacity': imginfo[4], | ||
283 | - 'cf_info:quality': imginfo[5], | ||
284 | - 'cf_info:rate': imginfo[6], | ||
285 | - 'cf_tag:chosen': imginfo[7], | ||
286 | - 'cf_tag:class': imginfo[8], | ||
287 | - 'cf_feat:' + feattype: imginfo[9]}) | ||
288 | - except ValueError: | ||
289 | - raise | ||
290 | - | ||
291 | - elif mode == 'spark': | ||
292 | - pass | ||
293 | - else: | ||
294 | - raise Exception("Unknown mode!") | ||
295 | - | ||
296 | - | ||
297 | - def format(self): | ||
298 | - self._extract_data(mode='hbase', writeback=False) | ||
299 | - self._embed_data(mode='hbase', rate=0.1, readforward=False, writeback=False) | ||
300 | - self._extract_feat(mode='hbase', feattype='ibd', readforward=False, writeback=True) | ||
301 | - | ||
302 | - | ||
303 | - def load_data(self, mode='local', feattype='ibd', tagtype='class'): | ||
304 | - INDEX = [] | ||
305 | - X = [] | ||
306 | - Y = [] | ||
307 | - | ||
308 | - if mode == "local": | ||
309 | - | ||
310 | - dict_dataset = {} | ||
311 | - | ||
312 | - with open(self.list_file, 'rb') as tsvfile: | ||
313 | - tsvfile = csv.reader(tsvfile, delimiter='\t') | ||
314 | - for line in tsvfile: | ||
315 | - hash = line[0] | ||
316 | - tag = line[-1] | ||
317 | - path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype) | ||
318 | - if path_feat: | ||
319 | - with open(path_feat, 'rb') as featfile: | ||
320 | - dict_dataset[hash] = (tag, json.loads(featfile.read())) | ||
321 | - | ||
322 | - for tag, feat in dict_dataset.values(): | ||
323 | - X.append([item for sublist in feat for subsublist in sublist for item in subsublist]) | ||
324 | - Y.append(int(tag)) | ||
325 | - | ||
326 | - elif mode == "remote" or mode == "hbase": | ||
327 | - if self.table == None: | ||
328 | - self.table = self.get_table() | ||
329 | - | ||
330 | - col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype | ||
331 | - for key, data in self.table.scan(columns=[col_feat, col_tag]): | ||
332 | - X.append(json.loads(data[col_feat])) | ||
333 | - Y.append(1 if data[col_tag] == 'True' else 0) | ||
334 | - | ||
335 | - elif mode == "spark" or mode == "cluster": | ||
336 | - if self.sparkcontex == None: | ||
337 | - self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077') | ||
338 | - | ||
339 | - result = self.sparkcontex.read_hbase(self.table_name) # result = {key:[feat,tag],...} | ||
340 | - for feat, tag in result: | ||
341 | - X.append(feat) | ||
342 | - Y.append(tag) | ||
343 | - | ||
344 | - else: | ||
345 | - raise Exception("Unknown mode!") | ||
346 | - | ||
347 | - return X, Y | ||
348 | - | ||
349 | - | ||
350 | - | ||
351 | - | ||
352 | - | ||
353 | - | ||
354 | - | ||
355 | - | ||
356 | - | ||
357 | - | ||
358 | - | ||
359 | - |
mdata/ILSVRC.py
@@ -302,7 +302,7 @@ class DataILSVRC(DataDumperBase): | @@ -302,7 +302,7 @@ class DataILSVRC(DataDumperBase): | ||
302 | tsvfile = csv.reader(tsvfile, delimiter='\t') | 302 | tsvfile = csv.reader(tsvfile, delimiter='\t') |
303 | for line in tsvfile: | 303 | for line in tsvfile: |
304 | path_img = os.path.join(self.img_dir, line[0][:3], line[0][3:] + '.jpg') | 304 | path_img = os.path.join(self.img_dir, line[0][:3], line[0][3:] + '.jpg') |
305 | - if path_img: | 305 | + if path_img: |
306 | with open(path_img, 'rb') as fpic: | 306 | with open(path_img, 'rb') as fpic: |
307 | dict_databuf[line[0] + '.jpg'] = fpic.read() | 307 | dict_databuf[line[0] + '.jpg'] = fpic.read() |
308 | 308 |
@@ -0,0 +1,367 @@ | @@ -0,0 +1,367 @@ | ||
1 | +__author__ = 'chunk' | ||
2 | + | ||
3 | +from . import * | ||
4 | +from ..mfeat import HOG, IntraBlockDiff | ||
5 | +from ..mspark import SC | ||
6 | +from ..common import * | ||
7 | + | ||
8 | +import os, sys | ||
9 | +from PIL import Image | ||
10 | +from hashlib import md5 | ||
11 | +import csv | ||
12 | +import shutil | ||
13 | +import json | ||
14 | +import collections | ||
15 | +import happybase | ||
16 | + | ||
17 | +from ..mjpeg import * | ||
18 | +from ..msteg import * | ||
19 | +from ..msteg.steganography import LSB, F3, F4, F5 | ||
20 | + | ||
21 | +import numpy as np | ||
22 | +from numpy.random import randn | ||
23 | +import pandas as pd | ||
24 | +from scipy import stats | ||
25 | + | ||
26 | +from subprocess import Popen, PIPE, STDOUT | ||
27 | +import tempfile | ||
28 | + | ||
29 | +np.random.seed(sum(map(ord, "whoami"))) | ||
30 | + | ||
31 | +package_dir = os.path.dirname(os.path.abspath(__file__)) | ||
32 | + | ||
33 | + | ||
34 | +class DataILSVRC_S(DataDumperBase): | ||
35 | + """ | ||
36 | + This module is specially for ILSVRC data processing under spark & hbase. | ||
37 | + | ||
38 | + We posit that the DB(e.g. HBase) has only the images data with md5 name as id. | ||
39 | + The task is to generate info(size,capacity,quality,etc.) and class & chosen tags, and then to perform embedding and finally to calculate ibd features. | ||
40 | + | ||
41 | + Each step includes reading from & writing to Hbase (through PC). | ||
42 | + And each step must have a 'spark' mode option, which means that the operation is performed by spark with reading & writing through RDDs. | ||
43 | + | ||
44 | + chunkplus@gmail.com | ||
45 | + """ | ||
46 | + | ||
47 | + def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train'): | ||
48 | + DataDumperBase.__init__(self, base_dir, category) | ||
49 | + | ||
50 | + self.base_dir = base_dir | ||
51 | + self.category = category | ||
52 | + | ||
53 | + self.dict_data = {} | ||
54 | + | ||
55 | + self.table_name = self.base_dir.strip('/').split('/')[-1] + '-' + self.category | ||
56 | + self.sparkcontex = None | ||
57 | + | ||
58 | + | ||
59 | + def get_table(self): | ||
60 | + if self.table != None: | ||
61 | + return self.table | ||
62 | + | ||
63 | + if self.connection is None: | ||
64 | + c = happybase.Connection('HPC-server') | ||
65 | + self.connection = c | ||
66 | + | ||
67 | + tables = self.connection.tables() | ||
68 | + if self.table_name not in tables: | ||
69 | + families = {'cf_pic': dict(), | ||
70 | + 'cf_info': dict(max_versions=10), | ||
71 | + 'cf_tag': dict(), | ||
72 | + 'cf_feat': dict(), | ||
73 | + } | ||
74 | + self.connection.create_table(name=self.table_name, families=families) | ||
75 | + | ||
76 | + table = self.connection.table(name=self.table_name) | ||
77 | + | ||
78 | + self.table = table | ||
79 | + | ||
80 | + return table | ||
81 | + | ||
82 | + def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None): | ||
83 | + """ | ||
84 | + Tempfile is our friend. (?) | ||
85 | + """ | ||
86 | + info_rate = info_rate if info_rate != None else 0.0 | ||
87 | + tag_chosen = tag_chosen if tag_chosen != None else stats.bernoulli.rvs(0.8) | ||
88 | + tag_class = tag_class if tag_class != None else 0 | ||
89 | + try: | ||
90 | + tmpf = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | ||
91 | + tmpf.write(img) | ||
92 | + tmpf.seek(0) | ||
93 | + im = Jpeg(tmpf.name, key=sample_key) | ||
94 | + info = [str(im.image_width), | ||
95 | + str(im.image_height), | ||
96 | + str(im.image_width * im.image_height), | ||
97 | + str(im.getCapacity()), | ||
98 | + str(im.getQuality()), | ||
99 | + str(info_rate), | ||
100 | + str(tag_chosen), | ||
101 | + str(tag_class)] | ||
102 | + return info | ||
103 | + except Exception as e: | ||
104 | + print e | ||
105 | + finally: | ||
106 | + tmpf.close() | ||
107 | + | ||
108 | + def _get_feat(self, image, feattype='ibd', **kwargs): | ||
109 | + size = kwargs.get('size', (48, 48)) | ||
110 | + | ||
111 | + if feattype == 'hog': | ||
112 | + feater = HOG.FeatHOG(size=size) | ||
113 | + elif feattype == 'ibd': | ||
114 | + feater = IntraBlockDiff.FeatIntraBlockDiff() | ||
115 | + else: | ||
116 | + raise Exception("Unknown feature type!") | ||
117 | + | ||
118 | + desc = feater.feat(image) | ||
119 | + | ||
120 | + return desc | ||
121 | + | ||
122 | + def _extract_data(self, mode='hbase', writeback=False): | ||
123 | + """ | ||
124 | + Get info barely out of image data. | ||
125 | + """ | ||
126 | + if mode == 'hbase': | ||
127 | + if self.table == None: | ||
128 | + self.table = self.get_table() | ||
129 | + | ||
130 | + cols = ['cf_pic:data'] | ||
131 | + for key, data in self.table.scan(columns=cols): | ||
132 | + data = data['cf_pic:data'] | ||
133 | + self.dict_data[key] = [data] + self._get_info(data) | ||
134 | + | ||
135 | + if not writeback: | ||
136 | + return self.dict_data | ||
137 | + else: | ||
138 | + try: | ||
139 | + with self.table.batch(batch_size=5000) as b: | ||
140 | + for imgname, imginfo in self.dict_data.items(): | ||
141 | + b.put(imgname, | ||
142 | + { | ||
143 | + # 'cf_pic:data': imginfo[0], | ||
144 | + 'cf_info:width': imginfo[1], | ||
145 | + 'cf_info:height': imginfo[2], | ||
146 | + 'cf_info:size': imginfo[3], | ||
147 | + 'cf_info:capacity': imginfo[4], | ||
148 | + 'cf_info:quality': imginfo[5], | ||
149 | + 'cf_info:rate': imginfo[6], | ||
150 | + 'cf_tag:chosen': imginfo[7], | ||
151 | + 'cf_tag:class': imginfo[8], | ||
152 | + }) | ||
153 | + except ValueError: | ||
154 | + raise | ||
155 | + | ||
156 | + | ||
157 | + elif mode == 'spark': | ||
158 | + pass | ||
159 | + else: | ||
160 | + raise Exception("Unknown mode!") | ||
161 | + | ||
162 | + | ||
163 | + def _embed_data(self, mode='hbase', rate=None, readforward=False, writeback=False): | ||
164 | + f5 = F5.F5(sample_key, 1) | ||
165 | + if mode == 'hbase': | ||
166 | + if self.table == None: | ||
167 | + self.table = self.get_table() | ||
168 | + | ||
169 | + if readforward: | ||
170 | + self.dict_data = {} | ||
171 | + cols = ['cf_pic:data', | ||
172 | + 'cf_info:width', | ||
173 | + 'cf_info:height', | ||
174 | + 'cf_info:size', | ||
175 | + 'cf_info:capacity', | ||
176 | + 'cf_info:quality', | ||
177 | + 'cf_info:rate', | ||
178 | + 'cf_tag:chosen', | ||
179 | + 'cf_tag:class'] | ||
180 | + for key, data in self.table.scan(columns=cols): | ||
181 | + data = [data[k] for k in cols] | ||
182 | + self.dict_data[key] = data | ||
183 | + | ||
184 | + dict_data_ext = {} | ||
185 | + | ||
186 | + for imgname, imgdata in self.dict_data.items(): | ||
187 | + try: | ||
188 | + tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | ||
189 | + tmpf_src.write(imgdata[0]) | ||
190 | + tmpf_src.seek(0) | ||
191 | + tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | ||
192 | + | ||
193 | + if rate == None: | ||
194 | + embed_rate = f5.embed_raw_data(tmpf_src.name, os.path.join(package_dir, '../res/toembed'), tmpf_dst.name) | ||
195 | + else: | ||
196 | + assert (rate >= 0 and rate < 1) | ||
197 | + # print capacity | ||
198 | + hidden = np.random.bytes(int(int(imgdata[4]) * rate) / 8) | ||
199 | + embed_rate = f5.embed_raw_data(tmpf_src.name, hidden, tmpf_dst.name, frommem=True) | ||
200 | + | ||
201 | + tmpf_dst.seek(0) | ||
202 | + raw = tmpf_dst.read() | ||
203 | + index = md5(raw).hexdigest() | ||
204 | + dict_data_ext[index + '.jpg'] = [raw] + self._get_info(raw, embed_rate, 0, 1) | ||
205 | + | ||
206 | + | ||
207 | + except Exception as e: | ||
208 | + print e | ||
209 | + raise | ||
210 | + finally: | ||
211 | + tmpf_src.close() | ||
212 | + tmpf_dst.close() | ||
213 | + | ||
214 | + self.dict_data.update(dict_data_ext) | ||
215 | + | ||
216 | + if not writeback: | ||
217 | + return self.dict_data | ||
218 | + else: | ||
219 | + try: | ||
220 | + with self.table.batch(batch_size=5000) as b: | ||
221 | + for imgname, imginfo in dict_data_ext.items(): | ||
222 | + b.put(imgname, | ||
223 | + { | ||
224 | + 'cf_pic:data': imginfo[0], | ||
225 | + 'cf_info:width': imginfo[1], | ||
226 | + 'cf_info:height': imginfo[2], | ||
227 | + 'cf_info:size': imginfo[3], | ||
228 | + 'cf_info:capacity': imginfo[4], | ||
229 | + 'cf_info:quality': imginfo[5], | ||
230 | + 'cf_info:rate': imginfo[6], | ||
231 | + 'cf_tag:chosen': imginfo[7], | ||
232 | + 'cf_tag:class': imginfo[8], }) | ||
233 | + except ValueError: | ||
234 | + raise | ||
235 | + | ||
236 | + elif mode == 'spark': | ||
237 | + pass | ||
238 | + else: | ||
239 | + raise Exception("Unknown mode!") | ||
240 | + | ||
241 | + | ||
242 | + def _extract_feat(self, mode='hbase', feattype='ibd', readforward=False, writeback=False, **kwargs): | ||
243 | + if mode == 'hbase': | ||
244 | + if self.table == None: | ||
245 | + self.table = self.get_table() | ||
246 | + | ||
247 | + if readforward: | ||
248 | + self.dict_data = {} | ||
249 | + cols = ['cf_pic:data', | ||
250 | + 'cf_info:width', | ||
251 | + 'cf_info:height', | ||
252 | + 'cf_info:size', | ||
253 | + 'cf_info:capacity', | ||
254 | + 'cf_info:quality', | ||
255 | + 'cf_info:rate', | ||
256 | + 'cf_tag:chosen', | ||
257 | + 'cf_tag:class'] | ||
258 | + for key, data in self.table.scan(columns=cols): | ||
259 | + data = [data[k] for k in cols] | ||
260 | + self.dict_data[key] = data | ||
261 | + | ||
262 | + for imgname, imgdata in self.dict_data.items(): | ||
263 | + try: | ||
264 | + tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | ||
265 | + tmpf_src.write(imgdata[0]) | ||
266 | + tmpf_src.seek(0) | ||
267 | + | ||
268 | + desc = json.dumps(self._get_feat(tmpf_src.name, feattype=feattype).tolist()) | ||
269 | + | ||
270 | + self.dict_data[imgname].append(desc) | ||
271 | + | ||
272 | + except Exception as e: | ||
273 | + print e | ||
274 | + raise | ||
275 | + finally: | ||
276 | + tmpf_src.close() | ||
277 | + | ||
278 | + if not writeback: | ||
279 | + return self.dict_data | ||
280 | + else: | ||
281 | + try: | ||
282 | + with self.table.batch(batch_size=5000) as b: | ||
283 | + for imgname, imginfo in self.dict_data.items(): | ||
284 | + b.put(imgname, | ||
285 | + { | ||
286 | + 'cf_pic:data': imginfo[0], | ||
287 | + 'cf_info:width': imginfo[1], | ||
288 | + 'cf_info:height': imginfo[2], | ||
289 | + 'cf_info:size': imginfo[3], | ||
290 | + 'cf_info:capacity': imginfo[4], | ||
291 | + 'cf_info:quality': imginfo[5], | ||
292 | + 'cf_info:rate': imginfo[6], | ||
293 | + 'cf_tag:chosen': imginfo[7], | ||
294 | + 'cf_tag:class': imginfo[8], | ||
295 | + 'cf_feat:' + feattype: imginfo[9]}) | ||
296 | + except ValueError: | ||
297 | + raise | ||
298 | + | ||
299 | + elif mode == 'spark': | ||
300 | + pass | ||
301 | + else: | ||
302 | + raise Exception("Unknown mode!") | ||
303 | + | ||
304 | + | ||
305 | + def format(self): | ||
306 | + self._extract_data(mode='hbase', writeback=False) | ||
307 | + self._embed_data(mode='hbase', rate=0.1, readforward=False, writeback=False) | ||
308 | + self._extract_feat(mode='hbase', feattype='ibd', readforward=False, writeback=True) | ||
309 | + | ||
310 | + | ||
311 | + def load_data(self, mode='local', feattype='ibd', tagtype='class'): | ||
312 | + INDEX = [] | ||
313 | + X = [] | ||
314 | + Y = [] | ||
315 | + | ||
316 | + if mode == "local": | ||
317 | + | ||
318 | + dict_dataset = {} | ||
319 | + | ||
320 | + with open(self.list_file, 'rb') as tsvfile: | ||
321 | + tsvfile = csv.reader(tsvfile, delimiter='\t') | ||
322 | + for line in tsvfile: | ||
323 | + hash = line[0] | ||
324 | + tag = line[-1] | ||
325 | + path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype) | ||
326 | + if path_feat: | ||
327 | + with open(path_feat, 'rb') as featfile: | ||
328 | + dict_dataset[hash] = (tag, json.loads(featfile.read())) | ||
329 | + | ||
330 | + for tag, feat in dict_dataset.values(): | ||
331 | + X.append([item for sublist in feat for subsublist in sublist for item in subsublist]) | ||
332 | + Y.append(int(tag)) | ||
333 | + | ||
334 | + elif mode == "remote" or mode == "hbase": | ||
335 | + if self.table == None: | ||
336 | + self.table = self.get_table() | ||
337 | + | ||
338 | + col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype | ||
339 | + for key, data in self.table.scan(columns=[col_feat, col_tag]): | ||
340 | + X.append(json.loads(data[col_feat])) | ||
341 | + Y.append(1 if data[col_tag] == 'True' else 0) | ||
342 | + | ||
343 | + elif mode == "spark" or mode == "cluster": | ||
344 | + if self.sparkcontex == None: | ||
345 | + self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077') | ||
346 | + | ||
347 | + result = self.sparkcontex.read_hbase(self.table_name) # result = {key:[feat,tag],...} | ||
348 | + for feat, tag in result: | ||
349 | + X.append(feat) | ||
350 | + Y.append(tag) | ||
351 | + | ||
352 | + else: | ||
353 | + raise Exception("Unknown mode!") | ||
354 | + | ||
355 | + return X, Y | ||
356 | + | ||
357 | + | ||
358 | + | ||
359 | + | ||
360 | + | ||
361 | + | ||
362 | + | ||
363 | + | ||
364 | + | ||
365 | + | ||
366 | + | ||
367 | + |
test/test_data.py
@@ -2,7 +2,7 @@ __author__ = 'chunk' | @@ -2,7 +2,7 @@ __author__ = 'chunk' | ||
2 | 2 | ||
3 | from ..common import * | 3 | from ..common import * |
4 | 4 | ||
5 | -from ..mdata import MSR, CV, ILSVRC | 5 | +from ..mdata import MSR, CV, ILSVRC, ILSVRC_S |
6 | 6 | ||
7 | 7 | ||
8 | def test_MSR(): | 8 | def test_MSR(): |
@@ -31,30 +31,38 @@ def test_CV(): | @@ -31,30 +31,38 @@ def test_CV(): | ||
31 | def test_ILSVRC(): | 31 | def test_ILSVRC(): |
32 | timer = Timer() | 32 | timer = Timer() |
33 | # dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train') | 33 | # dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train') |
34 | - dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_1') | 34 | + dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') |
35 | # dil = ILSVRC.DataILSVRC(base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train_1') | 35 | # dil = ILSVRC.DataILSVRC(base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train_1') |
36 | 36 | ||
37 | dil.format() | 37 | dil.format() |
38 | - dil.embed(rate=0.1) | ||
39 | - dil.extract_feat(feattype='ibd') | 38 | + # dil.embed(rate=0.1) |
39 | + # dil.extract_feat(feattype='ibd') | ||
40 | # dil.extract_feat(feattype='hog') | 40 | # dil.extract_feat(feattype='hog') |
41 | 41 | ||
42 | timer.mark() | 42 | timer.mark() |
43 | dil.store_img() | 43 | dil.store_img() |
44 | timer.report() | 44 | timer.report() |
45 | - | ||
46 | - timer.mark() | ||
47 | - dil.store_tag() | ||
48 | - timer.report() | ||
49 | 45 | ||
50 | - timer.mark() | ||
51 | - dil.store_info() | ||
52 | - timer.report() | 46 | + # timer.mark() |
47 | + # dil.store_tag() | ||
48 | + # timer.report() | ||
49 | + # | ||
50 | + # timer.mark() | ||
51 | + # dil.store_info() | ||
52 | + # timer.report() | ||
53 | + # | ||
54 | + # timer.mark() | ||
55 | + # dil.store_feat() | ||
56 | + # timer.report() | ||
53 | 57 | ||
54 | - timer.mark() | ||
55 | - dil.store_feat() | ||
56 | - timer.report() | ||
57 | 58 | ||
59 | +def test_ILSVRC_S(): | ||
60 | + timer = Timer() | ||
61 | + dils = ILSVRC_S.DataILSVRC_S(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | ||
62 | + | ||
63 | + dils._extract_data(mode='hbase', writeback=True) | ||
64 | + dils._embed_data(mode='hbase', rate=0.1, readforward=True, writeback=True) | ||
65 | + dils._extract_feat( mode='hbase', feattype='ibd', readforward=True, writeback=True) | ||
58 | 66 | ||
59 | if __name__ == '__main__': | 67 | if __name__ == '__main__': |
60 | # test_MSR() | 68 | # test_MSR() |