Commit 24768a990fbda84a2df0b387178ce4039863d423
1 parent
f25fd27c
Exists in
master
and in
1 other branch
mode 'hbase' finished. (testing is sometimes interesting, especially when your c…
…ode is well structured and with few bugs! )
Showing
4 changed files
with
390 additions
and
374 deletions
Show diff stats
mdata/ILSVRC-S.py
... | ... | @@ -1,359 +0,0 @@ |
1 | -__author__ = 'chunk' | |
2 | - | |
3 | -from . import * | |
4 | -from ..mfeat import HOG, IntraBlockDiff | |
5 | -from ..mspark import SC | |
6 | -from ..common import * | |
7 | - | |
8 | -import os, sys | |
9 | -from PIL import Image | |
10 | -from hashlib import md5 | |
11 | -import csv | |
12 | -import shutil | |
13 | -import json | |
14 | -import collections | |
15 | -import happybase | |
16 | - | |
17 | -from ..mjpeg import * | |
18 | -from ..msteg import * | |
19 | -from ..msteg.steganography import LSB, F3, F4, F5 | |
20 | - | |
21 | -import numpy as np | |
22 | -from numpy.random import randn | |
23 | -import pandas as pd | |
24 | -from scipy import stats | |
25 | - | |
26 | -from subprocess import Popen, PIPE, STDOUT | |
27 | -import tempfile | |
28 | - | |
29 | -np.random.seed(sum(map(ord, "whoami"))) | |
30 | - | |
31 | -package_dir = os.path.dirname(os.path.abspath(__file__)) | |
32 | - | |
33 | - | |
34 | -class DataILSVRCS(DataDumperBase): | |
35 | - """ | |
36 | - This module is specifically for ILSVRC data processing under spark & hbase. | |
37 | - | |
38 | - We posit that the DB(e.g. HBase) has only the images data with md5 name as id. | |
39 | - The task is to generate info(size,capacity,quality,etc.) and class & chosen tags, and then to perform embedding and finally to calculate ibd features. | |
40 | - | |
41 | - Each step includes reading from & writing to Hbase (through PC). | |
42 | - And each step must have a 'spark' mode option, which means that the operation is performed by spark with reading & writing through RDDs. | |
43 | - | |
44 | - chunkplus@gmail.com | |
45 | - """ | |
46 | - | |
47 | - def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train'): | |
48 | - DataDumperBase.__init__(self, base_dir, category) | |
49 | - | |
50 | - self.base_dir = base_dir | |
51 | - self.category = category | |
52 | - | |
53 | - self.dict_data = {} | |
54 | - | |
55 | - self.table_name = self.base_dir.strip('/').split('/')[-1] + '-' + self.category | |
56 | - self.sparkcontex = None | |
57 | - | |
58 | - | |
59 | - def _get_table(self): | |
60 | - if self.table != None: | |
61 | - return self.table | |
62 | - | |
63 | - if self.connection is None: | |
64 | - c = happybase.Connection('HPC-server') | |
65 | - self.connection = c | |
66 | - | |
67 | - tables = self.connection.tables() | |
68 | - if self.table_name not in tables: | |
69 | - families = {'cf_pic': dict(), | |
70 | - 'cf_info': dict(max_versions=10), | |
71 | - 'cf_tag': dict(), | |
72 | - 'cf_feat': dict(), | |
73 | - } | |
74 | - self.connection.create_table(name=self.table_name, families=families) | |
75 | - | |
76 | - table = self.connection.table(name=self.table_name) | |
77 | - | |
78 | - self.table = table | |
79 | - | |
80 | - return table | |
81 | - | |
82 | - def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None): | |
83 | - """ | |
84 | - Tempfile is our friend. (?) | |
85 | - """ | |
86 | - info_rate = info_rate if info_rate != None else 0.0 | |
87 | - tag_chosen = tag_chosen if tag_chosen != None else stats.bernoulli.rvs(0.8) | |
88 | - tag_class = tag_class if tag_class != None else 0 | |
89 | - try: | |
90 | - tmpf = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
91 | - tmpf.write(img) | |
92 | - # tmpf.seek(0) | |
93 | - im = Jpeg(tmpf.name, key=sample_key) | |
94 | - info = [im.image_width, | |
95 | - im.image_height, | |
96 | - im.image_width * im.image_height, | |
97 | - im.getCapacity(), | |
98 | - im.getQuality(), | |
99 | - info_rate, | |
100 | - tag_chosen, | |
101 | - tag_class] | |
102 | - return info | |
103 | - except Exception as e: | |
104 | - print e | |
105 | - finally: | |
106 | - tmpf.close() | |
107 | - | |
108 | - def _get_feat(self, image, feattype='ibd', **kwargs): | |
109 | - size = kwargs.get('size', (48, 48)) | |
110 | - | |
111 | - if feattype == 'hog': | |
112 | - feater = HOG.FeatHOG(size=size) | |
113 | - elif feattype == 'ibd': | |
114 | - feater = IntraBlockDiff.FeatIntraBlockDiff() | |
115 | - else: | |
116 | - raise Exception("Unknown feature type!") | |
117 | - | |
118 | - desc = feater.feat(image) | |
119 | - | |
120 | - return desc | |
121 | - | |
122 | - def _extract_data(self, mode='hbase', writeback=False): | |
123 | - """ | |
124 | - Get info barely out of image data. | |
125 | - """ | |
126 | - if mode == 'hbase': | |
127 | - if self.table == None: | |
128 | - self.table = self.get_table() | |
129 | - | |
130 | - cols = ['cf_pic:data'] | |
131 | - for key, data in self.table.scan(columns=cols, scan_batching=True): | |
132 | - self.dict_data[key] = [data] + self._get_info(data) | |
133 | - | |
134 | - if not writeback: | |
135 | - return self.dict_data | |
136 | - else: | |
137 | - try: | |
138 | - with self.table.batch(batch_size=5000) as b: | |
139 | - for imgname, imginfo in self.dict_data.items(): | |
140 | - b.put(imgname, | |
141 | - { | |
142 | - # 'cf_pic:data': imginfo[0], | |
143 | - 'cf_info:width': imginfo[1], | |
144 | - 'cf_info:height': imginfo[2], | |
145 | - 'cf_info:size': imginfo[3], | |
146 | - 'cf_info:capacity': imginfo[4], | |
147 | - 'cf_info:quality': imginfo[5], | |
148 | - 'cf_info:rate': imginfo[6], | |
149 | - 'cf_tag:chosen': imginfo[7], | |
150 | - 'cf_tag:class': imginfo[8], }) | |
151 | - except ValueError: | |
152 | - raise | |
153 | - | |
154 | - | |
155 | - elif mode == 'spark': | |
156 | - pass | |
157 | - else: | |
158 | - raise Exception("Unknown mode!") | |
159 | - | |
160 | - | |
161 | - def _embed_data(self, mode='hbase', rate=None, readforward=False, writeback=False): | |
162 | - f5 = F5.F5(sample_key, 1) | |
163 | - if mode == 'hbase': | |
164 | - if self.table == None: | |
165 | - self.table = self.get_table() | |
166 | - | |
167 | - if readforward: | |
168 | - self.dict_data = {} | |
169 | - cols = ['cf_pic:data', | |
170 | - 'cf_info:width', | |
171 | - 'cf_info:height', | |
172 | - 'cf_info:size', | |
173 | - 'cf_info:capacity', | |
174 | - 'cf_info:quality', | |
175 | - 'cf_info:rate', | |
176 | - 'cf_tag:chosen', | |
177 | - 'cf_tag:class'] | |
178 | - for key, data in self.table.scan(columns=cols, scan_batching=True): | |
179 | - self.dict_data[key] = data | |
180 | - | |
181 | - dict_data_ext = {} | |
182 | - | |
183 | - for imgname, imgdata in self.dict_data.items(): | |
184 | - try: | |
185 | - tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
186 | - tmpf_src.write(imgdata[0]) | |
187 | - tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
188 | - | |
189 | - if rate == None: | |
190 | - embed_rate = f5.embed_raw_data(tmpf_src, os.path.join(package_dir, '../res/toembed'), tmpf_dst) | |
191 | - else: | |
192 | - assert (rate >= 0 and rate < 1) | |
193 | - # print capacity | |
194 | - hidden = np.random.bytes(int(imgdata[4] * rate) / 8) | |
195 | - embed_rate = f5.embed_raw_data(tmpf_src, hidden, tmpf_dst, frommem=True) | |
196 | - | |
197 | - tmpf_dst.seek(0) | |
198 | - raw = tmpf_dst.read() | |
199 | - index = md5(raw).hexdigest() | |
200 | - dict_data_ext[index + '.jpg'] = [raw] + self._get_info(raw, embed_rate, 0, 1) | |
201 | - | |
202 | - | |
203 | - except Exception as e: | |
204 | - print e | |
205 | - finally: | |
206 | - tmpf_src.close() | |
207 | - tmpf_dst.close() | |
208 | - | |
209 | - self.dict_data.update(dict_data_ext) | |
210 | - | |
211 | - if not writeback: | |
212 | - return self.dict_data | |
213 | - else: | |
214 | - try: | |
215 | - with self.table.batch(batch_size=5000) as b: | |
216 | - for imgname, imginfo in dict_data_ext.items(): | |
217 | - b.put(imgname, | |
218 | - { | |
219 | - 'cf_pic:data': imginfo[0], | |
220 | - 'cf_info:width': imginfo[1], | |
221 | - 'cf_info:height': imginfo[2], | |
222 | - 'cf_info:size': imginfo[3], | |
223 | - 'cf_info:capacity': imginfo[4], | |
224 | - 'cf_info:quality': imginfo[5], | |
225 | - 'cf_info:rate': imginfo[6], | |
226 | - 'cf_tag:chosen': imginfo[7], | |
227 | - 'cf_tag:class': imginfo[8], }) | |
228 | - except ValueError: | |
229 | - raise | |
230 | - | |
231 | - elif mode == 'spark': | |
232 | - pass | |
233 | - else: | |
234 | - raise Exception("Unknown mode!") | |
235 | - | |
236 | - | |
237 | - def _extract_feat(self, mode='hbase', feattype='ibd', readforward=False, writeback=False, **kwargs): | |
238 | - if mode == 'hbase': | |
239 | - if self.table == None: | |
240 | - self.table = self.get_table() | |
241 | - | |
242 | - if readforward: | |
243 | - self.dict_data = {} | |
244 | - cols = ['cf_pic:data', | |
245 | - 'cf_info:width', | |
246 | - 'cf_info:height', | |
247 | - 'cf_info:size', | |
248 | - 'cf_info:capacity', | |
249 | - 'cf_info:quality', | |
250 | - 'cf_info:rate', | |
251 | - 'cf_tag:chosen', | |
252 | - 'cf_tag:class'] | |
253 | - for key, data in self.table.scan(columns=cols, scan_batching=True): | |
254 | - self.dict_data[key] = data | |
255 | - | |
256 | - for imgname, imgdata in self.dict_data.items(): | |
257 | - try: | |
258 | - tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
259 | - tmpf_src.write(imgdata[0]) | |
260 | - | |
261 | - desc = json.dumps(self._get_feat(tmpf_src, feattype=feattype)) | |
262 | - | |
263 | - self.dict_data[imgname].append(desc) | |
264 | - | |
265 | - except Exception as e: | |
266 | - print e | |
267 | - finally: | |
268 | - tmpf_src.close() | |
269 | - | |
270 | - if not writeback: | |
271 | - return self.dict_data | |
272 | - else: | |
273 | - try: | |
274 | - with self.table.batch(batch_size=5000) as b: | |
275 | - for imgname, imginfo in self.dict_data.items(): | |
276 | - b.put(imgname, | |
277 | - { | |
278 | - 'cf_pic:data': imginfo[0], | |
279 | - 'cf_info:width': imginfo[1], | |
280 | - 'cf_info:height': imginfo[2], | |
281 | - 'cf_info:size': imginfo[3], | |
282 | - 'cf_info:capacity': imginfo[4], | |
283 | - 'cf_info:quality': imginfo[5], | |
284 | - 'cf_info:rate': imginfo[6], | |
285 | - 'cf_tag:chosen': imginfo[7], | |
286 | - 'cf_tag:class': imginfo[8], | |
287 | - 'cf_feat:' + feattype: imginfo[9]}) | |
288 | - except ValueError: | |
289 | - raise | |
290 | - | |
291 | - elif mode == 'spark': | |
292 | - pass | |
293 | - else: | |
294 | - raise Exception("Unknown mode!") | |
295 | - | |
296 | - | |
297 | - def format(self): | |
298 | - self._extract_data(mode='hbase', writeback=False) | |
299 | - self._embed_data(mode='hbase', rate=0.1, readforward=False, writeback=False) | |
300 | - self._extract_feat(mode='hbase', feattype='ibd', readforward=False, writeback=True) | |
301 | - | |
302 | - | |
303 | - def load_data(self, mode='local', feattype='ibd', tagtype='class'): | |
304 | - INDEX = [] | |
305 | - X = [] | |
306 | - Y = [] | |
307 | - | |
308 | - if mode == "local": | |
309 | - | |
310 | - dict_dataset = {} | |
311 | - | |
312 | - with open(self.list_file, 'rb') as tsvfile: | |
313 | - tsvfile = csv.reader(tsvfile, delimiter='\t') | |
314 | - for line in tsvfile: | |
315 | - hash = line[0] | |
316 | - tag = line[-1] | |
317 | - path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype) | |
318 | - if path_feat: | |
319 | - with open(path_feat, 'rb') as featfile: | |
320 | - dict_dataset[hash] = (tag, json.loads(featfile.read())) | |
321 | - | |
322 | - for tag, feat in dict_dataset.values(): | |
323 | - X.append([item for sublist in feat for subsublist in sublist for item in subsublist]) | |
324 | - Y.append(int(tag)) | |
325 | - | |
326 | - elif mode == "remote" or mode == "hbase": | |
327 | - if self.table == None: | |
328 | - self.table = self.get_table() | |
329 | - | |
330 | - col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype | |
331 | - for key, data in self.table.scan(columns=[col_feat, col_tag]): | |
332 | - X.append(json.loads(data[col_feat])) | |
333 | - Y.append(1 if data[col_tag] == 'True' else 0) | |
334 | - | |
335 | - elif mode == "spark" or mode == "cluster": | |
336 | - if self.sparkcontex == None: | |
337 | - self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077') | |
338 | - | |
339 | - result = self.sparkcontex.read_hbase(self.table_name) # result = {key:[feat,tag],...} | |
340 | - for feat, tag in result: | |
341 | - X.append(feat) | |
342 | - Y.append(tag) | |
343 | - | |
344 | - else: | |
345 | - raise Exception("Unknown mode!") | |
346 | - | |
347 | - return X, Y | |
348 | - | |
349 | - | |
350 | - | |
351 | - | |
352 | - | |
353 | - | |
354 | - | |
355 | - | |
356 | - | |
357 | - | |
358 | - | |
359 | - |
mdata/ILSVRC.py
... | ... | @@ -302,7 +302,7 @@ class DataILSVRC(DataDumperBase): |
302 | 302 | tsvfile = csv.reader(tsvfile, delimiter='\t') |
303 | 303 | for line in tsvfile: |
304 | 304 | path_img = os.path.join(self.img_dir, line[0][:3], line[0][3:] + '.jpg') |
305 | - if path_img: | |
305 | + if path_img: | |
306 | 306 | with open(path_img, 'rb') as fpic: |
307 | 307 | dict_databuf[line[0] + '.jpg'] = fpic.read() |
308 | 308 | ... | ... |
... | ... | @@ -0,0 +1,367 @@ |
1 | +__author__ = 'chunk' | |
2 | + | |
3 | +from . import * | |
4 | +from ..mfeat import HOG, IntraBlockDiff | |
5 | +from ..mspark import SC | |
6 | +from ..common import * | |
7 | + | |
8 | +import os, sys | |
9 | +from PIL import Image | |
10 | +from hashlib import md5 | |
11 | +import csv | |
12 | +import shutil | |
13 | +import json | |
14 | +import collections | |
15 | +import happybase | |
16 | + | |
17 | +from ..mjpeg import * | |
18 | +from ..msteg import * | |
19 | +from ..msteg.steganography import LSB, F3, F4, F5 | |
20 | + | |
21 | +import numpy as np | |
22 | +from numpy.random import randn | |
23 | +import pandas as pd | |
24 | +from scipy import stats | |
25 | + | |
26 | +from subprocess import Popen, PIPE, STDOUT | |
27 | +import tempfile | |
28 | + | |
29 | +np.random.seed(sum(map(ord, "whoami"))) | |
30 | + | |
31 | +package_dir = os.path.dirname(os.path.abspath(__file__)) | |
32 | + | |
33 | + | |
34 | +class DataILSVRC_S(DataDumperBase): | |
35 | + """ | |
36 | + This module is specifically for ILSVRC data processing under spark & hbase. | |
37 | + | |
38 | + We posit that the DB(e.g. HBase) has only the images data with md5 name as id. | |
39 | + The task is to generate info(size,capacity,quality,etc.) and class & chosen tags, and then to perform embedding and finally to calculate ibd features. | |
40 | + | |
41 | + Each step includes reading from & writing to Hbase (through PC). | |
42 | + And each step must have a 'spark' mode option, which means that the operation is performed by spark with reading & writing through RDDs. | |
43 | + | |
44 | + chunkplus@gmail.com | |
45 | + """ | |
46 | + | |
47 | + def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train'): | |
48 | + DataDumperBase.__init__(self, base_dir, category) | |
49 | + | |
50 | + self.base_dir = base_dir | |
51 | + self.category = category | |
52 | + | |
53 | + self.dict_data = {} | |
54 | + | |
55 | + self.table_name = self.base_dir.strip('/').split('/')[-1] + '-' + self.category | |
56 | + self.sparkcontex = None | |
57 | + | |
58 | + | |
59 | + def get_table(self): | |
60 | + if self.table != None: | |
61 | + return self.table | |
62 | + | |
63 | + if self.connection is None: | |
64 | + c = happybase.Connection('HPC-server') | |
65 | + self.connection = c | |
66 | + | |
67 | + tables = self.connection.tables() | |
68 | + if self.table_name not in tables: | |
69 | + families = {'cf_pic': dict(), | |
70 | + 'cf_info': dict(max_versions=10), | |
71 | + 'cf_tag': dict(), | |
72 | + 'cf_feat': dict(), | |
73 | + } | |
74 | + self.connection.create_table(name=self.table_name, families=families) | |
75 | + | |
76 | + table = self.connection.table(name=self.table_name) | |
77 | + | |
78 | + self.table = table | |
79 | + | |
80 | + return table | |
81 | + | |
82 | + def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None): | |
83 | + """ | |
84 | + Tempfile is our friend. (?) | |
85 | + """ | |
86 | + info_rate = info_rate if info_rate != None else 0.0 | |
87 | + tag_chosen = tag_chosen if tag_chosen != None else stats.bernoulli.rvs(0.8) | |
88 | + tag_class = tag_class if tag_class != None else 0 | |
89 | + try: | |
90 | + tmpf = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
91 | + tmpf.write(img) | |
92 | + tmpf.seek(0) | |
93 | + im = Jpeg(tmpf.name, key=sample_key) | |
94 | + info = [str(im.image_width), | |
95 | + str(im.image_height), | |
96 | + str(im.image_width * im.image_height), | |
97 | + str(im.getCapacity()), | |
98 | + str(im.getQuality()), | |
99 | + str(info_rate), | |
100 | + str(tag_chosen), | |
101 | + str(tag_class)] | |
102 | + return info | |
103 | + except Exception as e: | |
104 | + print e | |
105 | + finally: | |
106 | + tmpf.close() | |
107 | + | |
108 | + def _get_feat(self, image, feattype='ibd', **kwargs): | |
109 | + size = kwargs.get('size', (48, 48)) | |
110 | + | |
111 | + if feattype == 'hog': | |
112 | + feater = HOG.FeatHOG(size=size) | |
113 | + elif feattype == 'ibd': | |
114 | + feater = IntraBlockDiff.FeatIntraBlockDiff() | |
115 | + else: | |
116 | + raise Exception("Unknown feature type!") | |
117 | + | |
118 | + desc = feater.feat(image) | |
119 | + | |
120 | + return desc | |
121 | + | |
122 | + def _extract_data(self, mode='hbase', writeback=False): | |
123 | + """ | |
124 | + Get info barely out of image data. | |
125 | + """ | |
126 | + if mode == 'hbase': | |
127 | + if self.table == None: | |
128 | + self.table = self.get_table() | |
129 | + | |
130 | + cols = ['cf_pic:data'] | |
131 | + for key, data in self.table.scan(columns=cols): | |
132 | + data = data['cf_pic:data'] | |
133 | + self.dict_data[key] = [data] + self._get_info(data) | |
134 | + | |
135 | + if not writeback: | |
136 | + return self.dict_data | |
137 | + else: | |
138 | + try: | |
139 | + with self.table.batch(batch_size=5000) as b: | |
140 | + for imgname, imginfo in self.dict_data.items(): | |
141 | + b.put(imgname, | |
142 | + { | |
143 | + # 'cf_pic:data': imginfo[0], | |
144 | + 'cf_info:width': imginfo[1], | |
145 | + 'cf_info:height': imginfo[2], | |
146 | + 'cf_info:size': imginfo[3], | |
147 | + 'cf_info:capacity': imginfo[4], | |
148 | + 'cf_info:quality': imginfo[5], | |
149 | + 'cf_info:rate': imginfo[6], | |
150 | + 'cf_tag:chosen': imginfo[7], | |
151 | + 'cf_tag:class': imginfo[8], | |
152 | + }) | |
153 | + except ValueError: | |
154 | + raise | |
155 | + | |
156 | + | |
157 | + elif mode == 'spark': | |
158 | + pass | |
159 | + else: | |
160 | + raise Exception("Unknown mode!") | |
161 | + | |
162 | + | |
163 | + def _embed_data(self, mode='hbase', rate=None, readforward=False, writeback=False): | |
164 | + f5 = F5.F5(sample_key, 1) | |
165 | + if mode == 'hbase': | |
166 | + if self.table == None: | |
167 | + self.table = self.get_table() | |
168 | + | |
169 | + if readforward: | |
170 | + self.dict_data = {} | |
171 | + cols = ['cf_pic:data', | |
172 | + 'cf_info:width', | |
173 | + 'cf_info:height', | |
174 | + 'cf_info:size', | |
175 | + 'cf_info:capacity', | |
176 | + 'cf_info:quality', | |
177 | + 'cf_info:rate', | |
178 | + 'cf_tag:chosen', | |
179 | + 'cf_tag:class'] | |
180 | + for key, data in self.table.scan(columns=cols): | |
181 | + data = [data[k] for k in cols] | |
182 | + self.dict_data[key] = data | |
183 | + | |
184 | + dict_data_ext = {} | |
185 | + | |
186 | + for imgname, imgdata in self.dict_data.items(): | |
187 | + try: | |
188 | + tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
189 | + tmpf_src.write(imgdata[0]) | |
190 | + tmpf_src.seek(0) | |
191 | + tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
192 | + | |
193 | + if rate == None: | |
194 | + embed_rate = f5.embed_raw_data(tmpf_src.name, os.path.join(package_dir, '../res/toembed'), tmpf_dst.name) | |
195 | + else: | |
196 | + assert (rate >= 0 and rate < 1) | |
197 | + # print capacity | |
198 | + hidden = np.random.bytes(int(int(imgdata[4]) * rate) / 8) | |
199 | + embed_rate = f5.embed_raw_data(tmpf_src.name, hidden, tmpf_dst.name, frommem=True) | |
200 | + | |
201 | + tmpf_dst.seek(0) | |
202 | + raw = tmpf_dst.read() | |
203 | + index = md5(raw).hexdigest() | |
204 | + dict_data_ext[index + '.jpg'] = [raw] + self._get_info(raw, embed_rate, 0, 1) | |
205 | + | |
206 | + | |
207 | + except Exception as e: | |
208 | + print e | |
209 | + raise | |
210 | + finally: | |
211 | + tmpf_src.close() | |
212 | + tmpf_dst.close() | |
213 | + | |
214 | + self.dict_data.update(dict_data_ext) | |
215 | + | |
216 | + if not writeback: | |
217 | + return self.dict_data | |
218 | + else: | |
219 | + try: | |
220 | + with self.table.batch(batch_size=5000) as b: | |
221 | + for imgname, imginfo in dict_data_ext.items(): | |
222 | + b.put(imgname, | |
223 | + { | |
224 | + 'cf_pic:data': imginfo[0], | |
225 | + 'cf_info:width': imginfo[1], | |
226 | + 'cf_info:height': imginfo[2], | |
227 | + 'cf_info:size': imginfo[3], | |
228 | + 'cf_info:capacity': imginfo[4], | |
229 | + 'cf_info:quality': imginfo[5], | |
230 | + 'cf_info:rate': imginfo[6], | |
231 | + 'cf_tag:chosen': imginfo[7], | |
232 | + 'cf_tag:class': imginfo[8], }) | |
233 | + except ValueError: | |
234 | + raise | |
235 | + | |
236 | + elif mode == 'spark': | |
237 | + pass | |
238 | + else: | |
239 | + raise Exception("Unknown mode!") | |
240 | + | |
241 | + | |
242 | + def _extract_feat(self, mode='hbase', feattype='ibd', readforward=False, writeback=False, **kwargs): | |
243 | + if mode == 'hbase': | |
244 | + if self.table == None: | |
245 | + self.table = self.get_table() | |
246 | + | |
247 | + if readforward: | |
248 | + self.dict_data = {} | |
249 | + cols = ['cf_pic:data', | |
250 | + 'cf_info:width', | |
251 | + 'cf_info:height', | |
252 | + 'cf_info:size', | |
253 | + 'cf_info:capacity', | |
254 | + 'cf_info:quality', | |
255 | + 'cf_info:rate', | |
256 | + 'cf_tag:chosen', | |
257 | + 'cf_tag:class'] | |
258 | + for key, data in self.table.scan(columns=cols): | |
259 | + data = [data[k] for k in cols] | |
260 | + self.dict_data[key] = data | |
261 | + | |
262 | + for imgname, imgdata in self.dict_data.items(): | |
263 | + try: | |
264 | + tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
265 | + tmpf_src.write(imgdata[0]) | |
266 | + tmpf_src.seek(0) | |
267 | + | |
268 | + desc = json.dumps(self._get_feat(tmpf_src.name, feattype=feattype).tolist()) | |
269 | + | |
270 | + self.dict_data[imgname].append(desc) | |
271 | + | |
272 | + except Exception as e: | |
273 | + print e | |
274 | + raise | |
275 | + finally: | |
276 | + tmpf_src.close() | |
277 | + | |
278 | + if not writeback: | |
279 | + return self.dict_data | |
280 | + else: | |
281 | + try: | |
282 | + with self.table.batch(batch_size=5000) as b: | |
283 | + for imgname, imginfo in self.dict_data.items(): | |
284 | + b.put(imgname, | |
285 | + { | |
286 | + 'cf_pic:data': imginfo[0], | |
287 | + 'cf_info:width': imginfo[1], | |
288 | + 'cf_info:height': imginfo[2], | |
289 | + 'cf_info:size': imginfo[3], | |
290 | + 'cf_info:capacity': imginfo[4], | |
291 | + 'cf_info:quality': imginfo[5], | |
292 | + 'cf_info:rate': imginfo[6], | |
293 | + 'cf_tag:chosen': imginfo[7], | |
294 | + 'cf_tag:class': imginfo[8], | |
295 | + 'cf_feat:' + feattype: imginfo[9]}) | |
296 | + except ValueError: | |
297 | + raise | |
298 | + | |
299 | + elif mode == 'spark': | |
300 | + pass | |
301 | + else: | |
302 | + raise Exception("Unknown mode!") | |
303 | + | |
304 | + | |
305 | + def format(self): | |
306 | + self._extract_data(mode='hbase', writeback=False) | |
307 | + self._embed_data(mode='hbase', rate=0.1, readforward=False, writeback=False) | |
308 | + self._extract_feat(mode='hbase', feattype='ibd', readforward=False, writeback=True) | |
309 | + | |
310 | + | |
311 | + def load_data(self, mode='local', feattype='ibd', tagtype='class'): | |
312 | + INDEX = [] | |
313 | + X = [] | |
314 | + Y = [] | |
315 | + | |
316 | + if mode == "local": | |
317 | + | |
318 | + dict_dataset = {} | |
319 | + | |
320 | + with open(self.list_file, 'rb') as tsvfile: | |
321 | + tsvfile = csv.reader(tsvfile, delimiter='\t') | |
322 | + for line in tsvfile: | |
323 | + hash = line[0] | |
324 | + tag = line[-1] | |
325 | + path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype) | |
326 | + if path_feat: | |
327 | + with open(path_feat, 'rb') as featfile: | |
328 | + dict_dataset[hash] = (tag, json.loads(featfile.read())) | |
329 | + | |
330 | + for tag, feat in dict_dataset.values(): | |
331 | + X.append([item for sublist in feat for subsublist in sublist for item in subsublist]) | |
332 | + Y.append(int(tag)) | |
333 | + | |
334 | + elif mode == "remote" or mode == "hbase": | |
335 | + if self.table == None: | |
336 | + self.table = self.get_table() | |
337 | + | |
338 | + col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype | |
339 | + for key, data in self.table.scan(columns=[col_feat, col_tag]): | |
340 | + X.append(json.loads(data[col_feat])) | |
341 | + Y.append(1 if data[col_tag] == 'True' else 0) | |
342 | + | |
343 | + elif mode == "spark" or mode == "cluster": | |
344 | + if self.sparkcontex == None: | |
345 | + self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077') | |
346 | + | |
347 | + result = self.sparkcontex.read_hbase(self.table_name) # result = {key:[feat,tag],...} | |
348 | + for feat, tag in result: | |
349 | + X.append(feat) | |
350 | + Y.append(tag) | |
351 | + | |
352 | + else: | |
353 | + raise Exception("Unknown mode!") | |
354 | + | |
355 | + return X, Y | |
356 | + | |
357 | + | |
358 | + | |
359 | + | |
360 | + | |
361 | + | |
362 | + | |
363 | + | |
364 | + | |
365 | + | |
366 | + | |
367 | + | ... | ... |
test/test_data.py
... | ... | @@ -2,7 +2,7 @@ __author__ = 'chunk' |
2 | 2 | |
3 | 3 | from ..common import * |
4 | 4 | |
5 | -from ..mdata import MSR, CV, ILSVRC | |
5 | +from ..mdata import MSR, CV, ILSVRC, ILSVRC_S | |
6 | 6 | |
7 | 7 | |
8 | 8 | def test_MSR(): |
... | ... | @@ -31,30 +31,38 @@ def test_CV(): |
31 | 31 | def test_ILSVRC(): |
32 | 32 | timer = Timer() |
33 | 33 | # dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train') |
34 | - dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_1') | |
34 | + dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | |
35 | 35 | # dil = ILSVRC.DataILSVRC(base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train_1') |
36 | 36 | |
37 | 37 | dil.format() |
38 | - dil.embed(rate=0.1) | |
39 | - dil.extract_feat(feattype='ibd') | |
38 | + # dil.embed(rate=0.1) | |
39 | + # dil.extract_feat(feattype='ibd') | |
40 | 40 | # dil.extract_feat(feattype='hog') |
41 | 41 | |
42 | 42 | timer.mark() |
43 | 43 | dil.store_img() |
44 | 44 | timer.report() |
45 | - | |
46 | - timer.mark() | |
47 | - dil.store_tag() | |
48 | - timer.report() | |
49 | 45 | |
50 | - timer.mark() | |
51 | - dil.store_info() | |
52 | - timer.report() | |
46 | + # timer.mark() | |
47 | + # dil.store_tag() | |
48 | + # timer.report() | |
49 | + # | |
50 | + # timer.mark() | |
51 | + # dil.store_info() | |
52 | + # timer.report() | |
53 | + # | |
54 | + # timer.mark() | |
55 | + # dil.store_feat() | |
56 | + # timer.report() | |
53 | 57 | |
54 | - timer.mark() | |
55 | - dil.store_feat() | |
56 | - timer.report() | |
57 | 58 | |
59 | +def test_ILSVRC_S(): | |
60 | + timer = Timer() | |
61 | + dils = ILSVRC_S.DataILSVRC_S(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2') | |
62 | + | |
63 | + dils._extract_data(mode='hbase', writeback=True) | |
64 | + dils._embed_data(mode='hbase', rate=0.1, readforward=True, writeback=True) | |
65 | + dils._extract_feat( mode='hbase', feattype='ibd', readforward=True, writeback=True) | |
58 | 66 | |
59 | 67 | if __name__ == '__main__': |
60 | 68 | # test_MSR() | ... | ... |