Commit 24768a990fbda84a2df0b387178ce4039863d423

Authored by Chunk
1 parent f25fd27c
Exists in master and in 1 other branch: refactor

mode 'hbase' finished. (Testing is sometimes interesting, especially when your code is well structured and has few bugs!)
mdata/ILSVRC-S.py
... ... @@ -1,359 +0,0 @@
1   -__author__ = 'chunk'
2   -
3   -from . import *
4   -from ..mfeat import HOG, IntraBlockDiff
5   -from ..mspark import SC
6   -from ..common import *
7   -
8   -import os, sys
9   -from PIL import Image
10   -from hashlib import md5
11   -import csv
12   -import shutil
13   -import json
14   -import collections
15   -import happybase
16   -
17   -from ..mjpeg import *
18   -from ..msteg import *
19   -from ..msteg.steganography import LSB, F3, F4, F5
20   -
21   -import numpy as np
22   -from numpy.random import randn
23   -import pandas as pd
24   -from scipy import stats
25   -
26   -from subprocess import Popen, PIPE, STDOUT
27   -import tempfile
28   -
29   -np.random.seed(sum(map(ord, "whoami")))
30   -
31   -package_dir = os.path.dirname(os.path.abspath(__file__))
32   -
33   -
34   -class DataILSVRCS(DataDumperBase):
35   - """
36   - This module is specially for ILSVRC data processing under spark & hbase.
37   -
38   - We posit that the DB(e.g. HBase) has only the images data with md5 name as id.
39   - The task is to gennerate info(size,capacity,quality,etc.) and class & chosen tags, and then to perform embedding and finally to calcculate ibd features.
40   -
41   - Each step includes reading from & writing to Hbase (though PC).
42   - And each step must have a 'spark' mode option, which means that the operation is performed by spark with reading & wrting through RDDs.
43   -
44   - chunkplus@gmail.com
45   - """
46   -
47   - def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train'):
48   - DataDumperBase.__init__(self, base_dir, category)
49   -
50   - self.base_dir = base_dir
51   - self.category = category
52   -
53   - self.dict_data = {}
54   -
55   - self.table_name = self.base_dir.strip('/').split('/')[-1] + '-' + self.category
56   - self.sparkcontex = None
57   -
58   -
59   - def _get_table(self):
60   - if self.table != None:
61   - return self.table
62   -
63   - if self.connection is None:
64   - c = happybase.Connection('HPC-server')
65   - self.connection = c
66   -
67   - tables = self.connection.tables()
68   - if self.table_name not in tables:
69   - families = {'cf_pic': dict(),
70   - 'cf_info': dict(max_versions=10),
71   - 'cf_tag': dict(),
72   - 'cf_feat': dict(),
73   - }
74   - self.connection.create_table(name=self.table_name, families=families)
75   -
76   - table = self.connection.table(name=self.table_name)
77   -
78   - self.table = table
79   -
80   - return table
81   -
82   - def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None):
83   - """
84   - Tempfile is our friend. (?)
85   - """
86   - info_rate = info_rate if info_rate != None else 0.0
87   - tag_chosen = tag_chosen if tag_chosen != None else stats.bernoulli.rvs(0.8)
88   - tag_class = tag_class if tag_class != None else 0
89   - try:
90   - tmpf = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
91   - tmpf.write(img)
92   - # tmpf.seek(0)
93   - im = Jpeg(tmpf.name, key=sample_key)
94   - info = [im.image_width,
95   - im.image_height,
96   - im.image_width * im.image_height,
97   - im.getCapacity(),
98   - im.getQuality(),
99   - info_rate,
100   - tag_chosen,
101   - tag_class]
102   - return info
103   - except Exception as e:
104   - print e
105   - finally:
106   - tmpf.close()
107   -
108   - def _get_feat(self, image, feattype='ibd', **kwargs):
109   - size = kwargs.get('size', (48, 48))
110   -
111   - if feattype == 'hog':
112   - feater = HOG.FeatHOG(size=size)
113   - elif feattype == 'ibd':
114   - feater = IntraBlockDiff.FeatIntraBlockDiff()
115   - else:
116   - raise Exception("Unknown feature type!")
117   -
118   - desc = feater.feat(image)
119   -
120   - return desc
121   -
122   - def _extract_data(self, mode='hbase', writeback=False):
123   - """
124   - Get info barely out of image data.
125   - """
126   - if mode == 'hbase':
127   - if self.table == None:
128   - self.table = self.get_table()
129   -
130   - cols = ['cf_pic:data']
131   - for key, data in self.table.scan(columns=cols, scan_batching=True):
132   - self.dict_data[key] = [data] + self._get_info(data)
133   -
134   - if not writeback:
135   - return self.dict_data
136   - else:
137   - try:
138   - with self.table.batch(batch_size=5000) as b:
139   - for imgname, imginfo in self.dict_data.items():
140   - b.put(imgname,
141   - {
142   - # 'cf_pic:data': imginfo[0],
143   - 'cf_info:width': imginfo[1],
144   - 'cf_info:height': imginfo[2],
145   - 'cf_info:size': imginfo[3],
146   - 'cf_info:capacity': imginfo[4],
147   - 'cf_info:quality': imginfo[5],
148   - 'cf_info:rate': imginfo[6],
149   - 'cf_tag:chosen': imginfo[7],
150   - 'cf_tag:class': imginfo[8], })
151   - except ValueError:
152   - raise
153   -
154   -
155   - elif mode == 'spark':
156   - pass
157   - else:
158   - raise Exception("Unknown mode!")
159   -
160   -
161   - def _embed_data(self, mode='hbase', rate=None, readforward=False, writeback=False):
162   - f5 = F5.F5(sample_key, 1)
163   - if mode == 'hbase':
164   - if self.table == None:
165   - self.table = self.get_table()
166   -
167   - if readforward:
168   - self.dict_data = {}
169   - cols = ['cf_pic:data',
170   - 'cf_info:width',
171   - 'cf_info:height',
172   - 'cf_info:size',
173   - 'cf_info:capacity',
174   - 'cf_info:quality',
175   - 'cf_info:rate',
176   - 'cf_tag:chosen',
177   - 'cf_tag:class']
178   - for key, data in self.table.scan(columns=cols, scan_batching=True):
179   - self.dict_data[key] = data
180   -
181   - dict_data_ext = {}
182   -
183   - for imgname, imgdata in self.dict_data.items():
184   - try:
185   - tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
186   - tmpf_src.write(imgdata[0])
187   - tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
188   -
189   - if rate == None:
190   - embed_rate = f5.embed_raw_data(tmpf_src, os.path.join(package_dir, '../res/toembed'), tmpf_dst)
191   - else:
192   - assert (rate >= 0 and rate < 1)
193   - # print capacity
194   - hidden = np.random.bytes(int(imgdata[4] * rate) / 8)
195   - embed_rate = f5.embed_raw_data(tmpf_src, hidden, tmpf_dst, frommem=True)
196   -
197   - tmpf_dst.seek(0)
198   - raw = tmpf_dst.read()
199   - index = md5(raw).hexdigest()
200   - dict_data_ext[index + '.jpg'] = [raw] + self._get_info(raw, embed_rate, 0, 1)
201   -
202   -
203   - except Exception as e:
204   - print e
205   - finally:
206   - tmpf_src.close()
207   - tmpf_dst.close()
208   -
209   - self.dict_data.update(dict_data_ext)
210   -
211   - if not writeback:
212   - return self.dict_data
213   - else:
214   - try:
215   - with self.table.batch(batch_size=5000) as b:
216   - for imgname, imginfo in dict_data_ext.items():
217   - b.put(imgname,
218   - {
219   - 'cf_pic:data': imginfo[0],
220   - 'cf_info:width': imginfo[1],
221   - 'cf_info:height': imginfo[2],
222   - 'cf_info:size': imginfo[3],
223   - 'cf_info:capacity': imginfo[4],
224   - 'cf_info:quality': imginfo[5],
225   - 'cf_info:rate': imginfo[6],
226   - 'cf_tag:chosen': imginfo[7],
227   - 'cf_tag:class': imginfo[8], })
228   - except ValueError:
229   - raise
230   -
231   - elif mode == 'spark':
232   - pass
233   - else:
234   - raise Exception("Unknown mode!")
235   -
236   -
237   - def _extract_feat(self, mode='hbase', feattype='ibd', readforward=False, writeback=False, **kwargs):
238   - if mode == 'hbase':
239   - if self.table == None:
240   - self.table = self.get_table()
241   -
242   - if readforward:
243   - self.dict_data = {}
244   - cols = ['cf_pic:data',
245   - 'cf_info:width',
246   - 'cf_info:height',
247   - 'cf_info:size',
248   - 'cf_info:capacity',
249   - 'cf_info:quality',
250   - 'cf_info:rate',
251   - 'cf_tag:chosen',
252   - 'cf_tag:class']
253   - for key, data in self.table.scan(columns=cols, scan_batching=True):
254   - self.dict_data[key] = data
255   -
256   - for imgname, imgdata in self.dict_data.items():
257   - try:
258   - tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
259   - tmpf_src.write(imgdata[0])
260   -
261   - desc = json.dumps(self._get_feat(tmpf_src, feattype=feattype))
262   -
263   - self.dict_data[imgname].append(desc)
264   -
265   - except Exception as e:
266   - print e
267   - finally:
268   - tmpf_src.close()
269   -
270   - if not writeback:
271   - return self.dict_data
272   - else:
273   - try:
274   - with self.table.batch(batch_size=5000) as b:
275   - for imgname, imginfo in self.dict_data.items():
276   - b.put(imgname,
277   - {
278   - 'cf_pic:data': imginfo[0],
279   - 'cf_info:width': imginfo[1],
280   - 'cf_info:height': imginfo[2],
281   - 'cf_info:size': imginfo[3],
282   - 'cf_info:capacity': imginfo[4],
283   - 'cf_info:quality': imginfo[5],
284   - 'cf_info:rate': imginfo[6],
285   - 'cf_tag:chosen': imginfo[7],
286   - 'cf_tag:class': imginfo[8],
287   - 'cf_feat:' + feattype: imginfo[9]})
288   - except ValueError:
289   - raise
290   -
291   - elif mode == 'spark':
292   - pass
293   - else:
294   - raise Exception("Unknown mode!")
295   -
296   -
297   - def format(self):
298   - self._extract_data(mode='hbase', writeback=False)
299   - self._embed_data(mode='hbase', rate=0.1, readforward=False, writeback=False)
300   - self._extract_feat(mode='hbase', feattype='ibd', readforward=False, writeback=True)
301   -
302   -
303   - def load_data(self, mode='local', feattype='ibd', tagtype='class'):
304   - INDEX = []
305   - X = []
306   - Y = []
307   -
308   - if mode == "local":
309   -
310   - dict_dataset = {}
311   -
312   - with open(self.list_file, 'rb') as tsvfile:
313   - tsvfile = csv.reader(tsvfile, delimiter='\t')
314   - for line in tsvfile:
315   - hash = line[0]
316   - tag = line[-1]
317   - path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype)
318   - if path_feat:
319   - with open(path_feat, 'rb') as featfile:
320   - dict_dataset[hash] = (tag, json.loads(featfile.read()))
321   -
322   - for tag, feat in dict_dataset.values():
323   - X.append([item for sublist in feat for subsublist in sublist for item in subsublist])
324   - Y.append(int(tag))
325   -
326   - elif mode == "remote" or mode == "hbase":
327   - if self.table == None:
328   - self.table = self.get_table()
329   -
330   - col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype
331   - for key, data in self.table.scan(columns=[col_feat, col_tag]):
332   - X.append(json.loads(data[col_feat]))
333   - Y.append(1 if data[col_tag] == 'True' else 0)
334   -
335   - elif mode == "spark" or mode == "cluster":
336   - if self.sparkcontex == None:
337   - self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077')
338   -
339   - result = self.sparkcontex.read_hbase(self.table_name) # result = {key:[feat,tag],...}
340   - for feat, tag in result:
341   - X.append(feat)
342   - Y.append(tag)
343   -
344   - else:
345   - raise Exception("Unknown mode!")
346   -
347   - return X, Y
348   -
349   -
350   -
351   -
352   -
353   -
354   -
355   -
356   -
357   -
358   -
359   -
mdata/ILSVRC.py
... ... @@ -302,7 +302,7 @@ class DataILSVRC(DataDumperBase):
302 302 tsvfile = csv.reader(tsvfile, delimiter='\t')
303 303 for line in tsvfile:
304 304 path_img = os.path.join(self.img_dir, line[0][:3], line[0][3:] + '.jpg')
305   - if path_img:
  305 + if path_img:
306 306 with open(path_img, 'rb') as fpic:
307 307 dict_databuf[line[0] + '.jpg'] = fpic.read()
308 308  
... ...
mdata/ILSVRC_S.py 0 → 100644
... ... @@ -0,0 +1,367 @@
  1 +__author__ = 'chunk'
  2 +
  3 +from . import *
  4 +from ..mfeat import HOG, IntraBlockDiff
  5 +from ..mspark import SC
  6 +from ..common import *
  7 +
  8 +import os, sys
  9 +from PIL import Image
  10 +from hashlib import md5
  11 +import csv
  12 +import shutil
  13 +import json
  14 +import collections
  15 +import happybase
  16 +
  17 +from ..mjpeg import *
  18 +from ..msteg import *
  19 +from ..msteg.steganography import LSB, F3, F4, F5
  20 +
  21 +import numpy as np
  22 +from numpy.random import randn
  23 +import pandas as pd
  24 +from scipy import stats
  25 +
  26 +from subprocess import Popen, PIPE, STDOUT
  27 +import tempfile
  28 +
  29 +np.random.seed(sum(map(ord, "whoami")))
  30 +
  31 +package_dir = os.path.dirname(os.path.abspath(__file__))
  32 +
  33 +
class DataILSVRC_S(DataDumperBase):
    """
    This module is specially for ILSVRC data processing under spark & hbase.

    We posit that the DB (e.g. HBase) has only the image data, with the md5 name as id.
    The task is to generate info (size, capacity, quality, etc.) and class & chosen tags,
    and then to perform embedding and finally to calculate ibd features.

    Each step includes reading from & writing to HBase (through PC).
    And each step must have a 'spark' mode option, which means that the operation is
    performed by spark with reading & writing through RDDs.

    chunkplus@gmail.com
    """

    def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train'):
        DataDumperBase.__init__(self, base_dir, category)

        self.base_dir = base_dir
        self.category = category

        # row key -> [raw jpeg bytes, width, height, size, capacity, quality, rate, chosen, class, (feat)]
        self.dict_data = {}

        # table name is derived from the dataset directory plus the category,
        # e.g. 'ILSVRC2013_DET_val-Train'
        self.table_name = self.base_dir.strip('/').split('/')[-1] + '-' + self.category
        self.sparkcontex = None  # lazily created in load_data (spark/cluster mode)
  58 +
  59 + def get_table(self):
  60 + if self.table != None:
  61 + return self.table
  62 +
  63 + if self.connection is None:
  64 + c = happybase.Connection('HPC-server')
  65 + self.connection = c
  66 +
  67 + tables = self.connection.tables()
  68 + if self.table_name not in tables:
  69 + families = {'cf_pic': dict(),
  70 + 'cf_info': dict(max_versions=10),
  71 + 'cf_tag': dict(),
  72 + 'cf_feat': dict(),
  73 + }
  74 + self.connection.create_table(name=self.table_name, families=families)
  75 +
  76 + table = self.connection.table(name=self.table_name)
  77 +
  78 + self.table = table
  79 +
  80 + return table
  81 +
  82 + def _get_info(self, img, info_rate=None, tag_chosen=None, tag_class=None):
  83 + """
  84 + Tempfile is our friend. (?)
  85 + """
  86 + info_rate = info_rate if info_rate != None else 0.0
  87 + tag_chosen = tag_chosen if tag_chosen != None else stats.bernoulli.rvs(0.8)
  88 + tag_class = tag_class if tag_class != None else 0
  89 + try:
  90 + tmpf = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
  91 + tmpf.write(img)
  92 + tmpf.seek(0)
  93 + im = Jpeg(tmpf.name, key=sample_key)
  94 + info = [str(im.image_width),
  95 + str(im.image_height),
  96 + str(im.image_width * im.image_height),
  97 + str(im.getCapacity()),
  98 + str(im.getQuality()),
  99 + str(info_rate),
  100 + str(tag_chosen),
  101 + str(tag_class)]
  102 + return info
  103 + except Exception as e:
  104 + print e
  105 + finally:
  106 + tmpf.close()
  107 +
  108 + def _get_feat(self, image, feattype='ibd', **kwargs):
  109 + size = kwargs.get('size', (48, 48))
  110 +
  111 + if feattype == 'hog':
  112 + feater = HOG.FeatHOG(size=size)
  113 + elif feattype == 'ibd':
  114 + feater = IntraBlockDiff.FeatIntraBlockDiff()
  115 + else:
  116 + raise Exception("Unknown feature type!")
  117 +
  118 + desc = feater.feat(image)
  119 +
  120 + return desc
  121 +
  122 + def _extract_data(self, mode='hbase', writeback=False):
  123 + """
  124 + Get info barely out of image data.
  125 + """
  126 + if mode == 'hbase':
  127 + if self.table == None:
  128 + self.table = self.get_table()
  129 +
  130 + cols = ['cf_pic:data']
  131 + for key, data in self.table.scan(columns=cols):
  132 + data = data['cf_pic:data']
  133 + self.dict_data[key] = [data] + self._get_info(data)
  134 +
  135 + if not writeback:
  136 + return self.dict_data
  137 + else:
  138 + try:
  139 + with self.table.batch(batch_size=5000) as b:
  140 + for imgname, imginfo in self.dict_data.items():
  141 + b.put(imgname,
  142 + {
  143 + # 'cf_pic:data': imginfo[0],
  144 + 'cf_info:width': imginfo[1],
  145 + 'cf_info:height': imginfo[2],
  146 + 'cf_info:size': imginfo[3],
  147 + 'cf_info:capacity': imginfo[4],
  148 + 'cf_info:quality': imginfo[5],
  149 + 'cf_info:rate': imginfo[6],
  150 + 'cf_tag:chosen': imginfo[7],
  151 + 'cf_tag:class': imginfo[8],
  152 + })
  153 + except ValueError:
  154 + raise
  155 +
  156 +
  157 + elif mode == 'spark':
  158 + pass
  159 + else:
  160 + raise Exception("Unknown mode!")
  161 +
  162 +
  163 + def _embed_data(self, mode='hbase', rate=None, readforward=False, writeback=False):
  164 + f5 = F5.F5(sample_key, 1)
  165 + if mode == 'hbase':
  166 + if self.table == None:
  167 + self.table = self.get_table()
  168 +
  169 + if readforward:
  170 + self.dict_data = {}
  171 + cols = ['cf_pic:data',
  172 + 'cf_info:width',
  173 + 'cf_info:height',
  174 + 'cf_info:size',
  175 + 'cf_info:capacity',
  176 + 'cf_info:quality',
  177 + 'cf_info:rate',
  178 + 'cf_tag:chosen',
  179 + 'cf_tag:class']
  180 + for key, data in self.table.scan(columns=cols):
  181 + data = [data[k] for k in cols]
  182 + self.dict_data[key] = data
  183 +
  184 + dict_data_ext = {}
  185 +
  186 + for imgname, imgdata in self.dict_data.items():
  187 + try:
  188 + tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
  189 + tmpf_src.write(imgdata[0])
  190 + tmpf_src.seek(0)
  191 + tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
  192 +
  193 + if rate == None:
  194 + embed_rate = f5.embed_raw_data(tmpf_src.name, os.path.join(package_dir, '../res/toembed'), tmpf_dst.name)
  195 + else:
  196 + assert (rate >= 0 and rate < 1)
  197 + # print capacity
  198 + hidden = np.random.bytes(int(int(imgdata[4]) * rate) / 8)
  199 + embed_rate = f5.embed_raw_data(tmpf_src.name, hidden, tmpf_dst.name, frommem=True)
  200 +
  201 + tmpf_dst.seek(0)
  202 + raw = tmpf_dst.read()
  203 + index = md5(raw).hexdigest()
  204 + dict_data_ext[index + '.jpg'] = [raw] + self._get_info(raw, embed_rate, 0, 1)
  205 +
  206 +
  207 + except Exception as e:
  208 + print e
  209 + raise
  210 + finally:
  211 + tmpf_src.close()
  212 + tmpf_dst.close()
  213 +
  214 + self.dict_data.update(dict_data_ext)
  215 +
  216 + if not writeback:
  217 + return self.dict_data
  218 + else:
  219 + try:
  220 + with self.table.batch(batch_size=5000) as b:
  221 + for imgname, imginfo in dict_data_ext.items():
  222 + b.put(imgname,
  223 + {
  224 + 'cf_pic:data': imginfo[0],
  225 + 'cf_info:width': imginfo[1],
  226 + 'cf_info:height': imginfo[2],
  227 + 'cf_info:size': imginfo[3],
  228 + 'cf_info:capacity': imginfo[4],
  229 + 'cf_info:quality': imginfo[5],
  230 + 'cf_info:rate': imginfo[6],
  231 + 'cf_tag:chosen': imginfo[7],
  232 + 'cf_tag:class': imginfo[8], })
  233 + except ValueError:
  234 + raise
  235 +
  236 + elif mode == 'spark':
  237 + pass
  238 + else:
  239 + raise Exception("Unknown mode!")
  240 +
  241 +
  242 + def _extract_feat(self, mode='hbase', feattype='ibd', readforward=False, writeback=False, **kwargs):
  243 + if mode == 'hbase':
  244 + if self.table == None:
  245 + self.table = self.get_table()
  246 +
  247 + if readforward:
  248 + self.dict_data = {}
  249 + cols = ['cf_pic:data',
  250 + 'cf_info:width',
  251 + 'cf_info:height',
  252 + 'cf_info:size',
  253 + 'cf_info:capacity',
  254 + 'cf_info:quality',
  255 + 'cf_info:rate',
  256 + 'cf_tag:chosen',
  257 + 'cf_tag:class']
  258 + for key, data in self.table.scan(columns=cols):
  259 + data = [data[k] for k in cols]
  260 + self.dict_data[key] = data
  261 +
  262 + for imgname, imgdata in self.dict_data.items():
  263 + try:
  264 + tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
  265 + tmpf_src.write(imgdata[0])
  266 + tmpf_src.seek(0)
  267 +
  268 + desc = json.dumps(self._get_feat(tmpf_src.name, feattype=feattype).tolist())
  269 +
  270 + self.dict_data[imgname].append(desc)
  271 +
  272 + except Exception as e:
  273 + print e
  274 + raise
  275 + finally:
  276 + tmpf_src.close()
  277 +
  278 + if not writeback:
  279 + return self.dict_data
  280 + else:
  281 + try:
  282 + with self.table.batch(batch_size=5000) as b:
  283 + for imgname, imginfo in self.dict_data.items():
  284 + b.put(imgname,
  285 + {
  286 + 'cf_pic:data': imginfo[0],
  287 + 'cf_info:width': imginfo[1],
  288 + 'cf_info:height': imginfo[2],
  289 + 'cf_info:size': imginfo[3],
  290 + 'cf_info:capacity': imginfo[4],
  291 + 'cf_info:quality': imginfo[5],
  292 + 'cf_info:rate': imginfo[6],
  293 + 'cf_tag:chosen': imginfo[7],
  294 + 'cf_tag:class': imginfo[8],
  295 + 'cf_feat:' + feattype: imginfo[9]})
  296 + except ValueError:
  297 + raise
  298 +
  299 + elif mode == 'spark':
  300 + pass
  301 + else:
  302 + raise Exception("Unknown mode!")
  303 +
  304 +
    def format(self):
        """
        One-shot pipeline: extract image info, embed at rate 0.1, then
        extract ibd features and write the final rows back to HBase.

        Note: only the last step writes back; the first two keep their
        results in self.dict_data and forward them in memory.
        """
        self._extract_data(mode='hbase', writeback=False)
        self._embed_data(mode='hbase', rate=0.1, readforward=False, writeback=False)
        self._extract_feat(mode='hbase', feattype='ibd', readforward=False, writeback=True)
  309 +
  310 +
  311 + def load_data(self, mode='local', feattype='ibd', tagtype='class'):
  312 + INDEX = []
  313 + X = []
  314 + Y = []
  315 +
  316 + if mode == "local":
  317 +
  318 + dict_dataset = {}
  319 +
  320 + with open(self.list_file, 'rb') as tsvfile:
  321 + tsvfile = csv.reader(tsvfile, delimiter='\t')
  322 + for line in tsvfile:
  323 + hash = line[0]
  324 + tag = line[-1]
  325 + path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype)
  326 + if path_feat:
  327 + with open(path_feat, 'rb') as featfile:
  328 + dict_dataset[hash] = (tag, json.loads(featfile.read()))
  329 +
  330 + for tag, feat in dict_dataset.values():
  331 + X.append([item for sublist in feat for subsublist in sublist for item in subsublist])
  332 + Y.append(int(tag))
  333 +
  334 + elif mode == "remote" or mode == "hbase":
  335 + if self.table == None:
  336 + self.table = self.get_table()
  337 +
  338 + col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype
  339 + for key, data in self.table.scan(columns=[col_feat, col_tag]):
  340 + X.append(json.loads(data[col_feat]))
  341 + Y.append(1 if data[col_tag] == 'True' else 0)
  342 +
  343 + elif mode == "spark" or mode == "cluster":
  344 + if self.sparkcontex == None:
  345 + self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077')
  346 +
  347 + result = self.sparkcontex.read_hbase(self.table_name) # result = {key:[feat,tag],...}
  348 + for feat, tag in result:
  349 + X.append(feat)
  350 + Y.append(tag)
  351 +
  352 + else:
  353 + raise Exception("Unknown mode!")
  354 +
  355 + return X, Y
  356 +
  357 +
  358 +
  359 +
  360 +
  361 +
  362 +
  363 +
  364 +
  365 +
  366 +
  367 +
... ...
test/test_data.py
... ... @@ -2,7 +2,7 @@ __author__ = &#39;chunk&#39;
2 2  
3 3 from ..common import *
4 4  
5   -from ..mdata import MSR, CV, ILSVRC
  5 +from ..mdata import MSR, CV, ILSVRC, ILSVRC_S
6 6  
7 7  
8 8 def test_MSR():
... ... @@ -31,30 +31,38 @@ def test_CV():
31 31 def test_ILSVRC():
32 32 timer = Timer()
33 33 # dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train')
34   - dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_1')
  34 + dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2')
35 35 # dil = ILSVRC.DataILSVRC(base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train_1')
36 36  
37 37 dil.format()
38   - dil.embed(rate=0.1)
39   - dil.extract_feat(feattype='ibd')
  38 + # dil.embed(rate=0.1)
  39 + # dil.extract_feat(feattype='ibd')
40 40 # dil.extract_feat(feattype='hog')
41 41  
42 42 timer.mark()
43 43 dil.store_img()
44 44 timer.report()
45   -
46   - timer.mark()
47   - dil.store_tag()
48   - timer.report()
49 45  
50   - timer.mark()
51   - dil.store_info()
52   - timer.report()
  46 + # timer.mark()
  47 + # dil.store_tag()
  48 + # timer.report()
  49 + #
  50 + # timer.mark()
  51 + # dil.store_info()
  52 + # timer.report()
  53 + #
  54 + # timer.mark()
  55 + # dil.store_feat()
  56 + # timer.report()
53 57  
54   - timer.mark()
55   - dil.store_feat()
56   - timer.report()
57 58  
def test_ILSVRC_S():
    # End-to-end smoke test of the HBase pipeline on the Train_2 table:
    # extract info, embed at 10% of capacity, extract ibd features,
    # writing each stage back so the next can read it forward.
    timer = Timer()
    dils = ILSVRC_S.DataILSVRC_S(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_2')

    dils._extract_data(mode='hbase', writeback=True)
    dils._embed_data(mode='hbase', rate=0.1, readforward=True, writeback=True)
    dils._extract_feat(mode='hbase', feattype='ibd', readforward=True, writeback=True)
58 66  
59 67 if __name__ == '__main__':
60 68 # test_MSR()
... ...