2c2d57c7
Chunk
ILSVRC datapath h...
|
1
2
3
|
__author__ = 'chunk'
from . import *
|
84648488
Chunk
reverted.
|
4
|
from ..mfeat import HOG, IntraBlockDiff
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
from ..mspark import SC
from ..common import *
import os, sys
from PIL import Image
from hashlib import md5
import csv
import shutil
import json
import collections
import happybase
from ..mjpeg import *
from ..msteg import *
|
9ff70cf4
Chunk
capacity engeneer...
|
19
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
20
|
import os
|
d1042d03
Chunk
staged.
|
21
22
23
24
|
import numpy as np
from numpy.random import randn
import pandas as pd
from scipy import stats
|
ec755e37
Chunk
cropping.
|
25
|
|
d1042d03
Chunk
staged.
|
26
|
|
080c30c2
Chunk
F5 lib updated. I...
|
27
28
|
# Seed NumPy's global RNG deterministically (seed = sum of the ord values
# of "whoami", i.e. 645) so downstream random draws are reproducible.
np.random.seed(sum(ord(ch) for ch in "whoami"))
|
84648488
Chunk
reverted.
|
29
|
|
d1042d03
Chunk
staged.
|
30
31
|
class DataILSVRC(DataDumperBase):
    """Dumper for the ILSVRC image set.

    Hash-copies raw images into a content-addressed layout, builds a TSV
    index, extracts features, and pushes images/info/tags/features into
    an HBase table (optionally read back via Spark).
    """

    def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train'):
        """Record the source/destination directory layout under *base_dir*/*category*."""
        DataDumperBase.__init__(self, base_dir, category)

        self.base_dir = base_dir
        self.category = category

        # Raw input images live in <base_dir>/<category>.
        self.data_dir = os.path.join(base_dir, category)

        # Everything produced by this class goes under <base_dir>/dst/<category>.
        dst = os.path.join(base_dir, 'dst', category)
        self.dst_dir = dst
        self.list_file = os.path.join(dst, 'file-tag.tsv')
        self.feat_dir = os.path.join(dst, 'Feat')
        self.img_dir = os.path.join(dst, 'Img')

        # md5-hex -> [width, height, size, quality]; filled by _hash_copy().
        self.dict_data = {}

        # HBase table name: '<last path component of base_dir>-<category>'.
        self.table_name = '%s-%s' % (base_dir.strip('/').split('/')[-1], category)

        # NOTE(review): attribute name 'sparkcontex' (sic) is kept as-is —
        # load_data() reads it by this exact name.
        self.sparkcontex = None
def format(self):
    """Format the dataset: walk the raw images and hash-copy them into place."""
    self.extract()
|
02528074
Chunk
staged.
|
51
|
def _hash_copy(self, image):
    """Copy *image* into the content-addressed store under self.img_dir.

    Non-JPEG inputs are first re-encoded to JPEG via the scratch file
    'res/tmp.jpg'.  The MD5 of the file bytes names the copy:
    <img_dir>/<md5[:3]>/<md5[3:]>.jpg.  Width, height, file size and JPEG
    quality are recorded in self.dict_data keyed by the hash.
    """
    # Re-encode anything that is not already a JPEG.  The suffix test is
    # case-insensitive and accepts '.jpeg' too, so such files are no longer
    # pointlessly re-compressed (the original only matched lowercase 'jpg').
    if not image.lower().endswith(('jpg', 'jpeg')):
        img = Image.open(image)
        img.save('res/tmp.jpg', format='JPEG')  # assumes ./res exists — TODO confirm
        image = 'res/tmp.jpg'

    with open(image, 'rb') as f:
        index = md5(f.read()).hexdigest()

    im = Jpeg(image, key=sample_key)
    self.dict_data[index] = [im.image_width, im.image_height, os.path.getsize(image), im.getQuality()]

    # origion:
    # dir = base_dir + 'Img/Train/' + index[:3]
    target_dir = os.path.join(self.img_dir, index[:3])  # renamed from 'dir': shadowed builtin
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    image_path = os.path.join(target_dir, index[3:] + '.jpg')
    # print image_path

    if not os.path.exists(image_path):
        shutil.copy(image, image_path)
def _build_list(self):
    """Write self.dict_data to self.list_file as hash-sorted TSV rows.

    Each row is: hash<TAB>width<TAB>height<TAB>size<TAB>quality.
    """
    assert self.list_file is not None  # identity test with None, not '!='

    with open(self.list_file, 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        # Keys are unique, so plain sorted() orders by hash — no need for
        # the intermediate OrderedDict the original built.
        for key, value in sorted(self.dict_data.items()):
            writer.writerow([key] + value)
|
554a7b9a
Chunk
staged.
|
85
|
|
84648488
Chunk
reverted.
|
86
87
88
89
90
|
def _anaylis(self):
    """Annotate the TSV list with a random binary 'class' column.

    Reads the TSV written by _build_list, orders rows by (size, quality),
    draws a Bernoulli(0.3) label per row, and rewrites the file with that
    label appended as the last column.

    NOTE(review): the name is a typo for '_analysis'; kept because
    extract() calls it by this name.
    """
    df_ILS = pd.read_csv(self.list_file, names=['hash', 'width', 'height', 'size', 'quality'], sep='\t')
    length = df_ILS.shape[0]
    # DataFrame.sort() was deprecated and then removed from pandas;
    # sort_values() is the supported equivalent.
    df_new = df_ILS.sort_values(['size', 'quality'], ascending=True)
    rand_class = stats.bernoulli.rvs(0.3, size=length)

    df_new['class'] = pd.Series(rand_class, index=df_new.index)
    df_new.to_csv(self.list_file, header=False, index=False, sep='\t')
def extract(self):
    """Walk self.data_dir, hash-copy every image, then build and annotate the TSV list."""
    for path, subdirs, files in os.walk(self.data_dir):
        for name in files:
            imagepath = os.path.join(path, name)
            # print imagepath
            try:
                self._hash_copy(imagepath)
            except Exception:
                # Best-effort: skip unreadable/corrupt images.  Narrowed from
                # a bare 'except' so KeyboardInterrupt/SystemExit propagate.
                pass

    self._build_list()
    self._anaylis()
def get_table(self):
    """Return (creating if necessary) the HBase table for this dataset.

    Lazily opens a happybase connection to 'HPC-server' and creates the
    table with the expected column families on first use.  The table
    handle is cached on self.table.
    """
    if self.table is not None:  # identity test with None, not '!='
        return self.table

    if self.connection is None:
        self.connection = happybase.Connection('HPC-server')

    if self.table_name not in self.connection.tables():
        families = {
            'cf_pic': dict(),
            'cf_info': dict(max_versions=10),
            'cf_tag': dict(),
            'cf_feat': dict(),
        }
        self.connection.create_table(name=self.table_name, families=families)

    self.table = self.connection.table(name=self.table_name)
    return self.table
|
080c30c2
Chunk
F5 lib updated. I...
|
131
132
133
134
|
def store_image(self):
    """Upload every listed image's JPEG bytes into HBase (cf_pic:data)."""
    if self.table is None:
        self.table = self.get_table()

    dict_databuf = {}

    with open(self.list_file, 'rb') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for line in reader:
            path_img = os.path.join(self.img_dir, line[0][:3], line[0][3:] + '.jpg')
            # Guard on existence: the original tested the path string itself,
            # which is always truthy, so missing files crashed the loop.
            if os.path.exists(path_img):
                with open(path_img, 'rb') as fpic:
                    dict_databuf[line[0] + '.jpg'] = fpic.read()

    # (Dropped the original's vacuous 'except ValueError: raise' wrapper —
    # re-raising unchanged is a no-op.)
    with self.table.batch(batch_size=5000) as b:
        for imgname, imgdata in dict_databuf.items():
            b.put(imgname, {'cf_pic:data': imgdata})
|
d0be60e7
Chunk
jpeg update.
|
154
155
|
def store_info(self, infotype='all'):
    """Upload per-image metadata into HBase cf_info (width/height/size/quality).

    Only infotype='all' is implemented; any other value raises.
    """
    if self.table is None:
        self.table = self.get_table()

    if infotype != 'all':
        raise Exception("Unknown mode!")

    dict_infobuf = {}

    with open(self.list_file, 'rb') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for line in reader:
            # TSV columns are: hash, width, height, size, quality, class.
            # line[1:-1] selects the four info columns.  (Original used
            # line[2:-1], which dropped 'width', shifted every field and
            # made imginfo[3] an IndexError.)
            dict_infobuf[line[0] + '.jpg'] = line[1:-1]

    with self.table.batch(batch_size=5000) as b:
        for imgname, imginfo in dict_infobuf.items():
            b.put(imgname,
                  {'cf_info:width': imginfo[0], 'cf_info:height': imginfo[1],
                   'cf_info:size': imginfo[2], 'cf_info:quality': imginfo[3]})
|
d0be60e7
Chunk
jpeg update.
|
178
|
|
080c30c2
Chunk
F5 lib updated. I...
|
179
|
|
9ff70cf4
Chunk
capacity engeneer...
|
180
|
def store_tag(self, tagtype='class'):
    """Upload the per-image tag (last TSV column) into HBase cf_tag:<tagtype>."""
    if self.table is None:  # identity test with None, not '=='
        self.table = self.get_table()

    dict_tagbuf = {}

    with open(self.list_file, 'rb') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for line in reader:
            dict_tagbuf[line[0] + '.jpg'] = line[-1]

    # (Dropped the original's vacuous 'except ValueError: raise' wrapper.)
    with self.table.batch(batch_size=5000) as b:
        for imgname, imgtag in dict_tagbuf.items():
            b.put(imgname, {'cf_tag:' + tagtype: imgtag})
def get_feat(self, image, feattype='ibd', **kwargs):
    """Compute one feature descriptor for *image*.

    feattype 'hog' uses HOG.FeatHOG (window size from kwargs['size'],
    default (48, 48)); 'ibd' uses IntraBlockDiff.  Anything else raises.
    """
    if feattype == 'hog':
        feater = HOG.FeatHOG(size=kwargs.get('size', (48, 48)))
    elif feattype == 'ibd':
        feater = IntraBlockDiff.FeatIntraBlockDiff()
    else:
        raise Exception("Unknown feature type!")

    return feater.feat(image)
|
080c30c2
Chunk
F5 lib updated. I...
|
214
|
def extract_feat(self, feattype='ibd'):
    """Compute features for every listed image and dump them as JSON files.

    Output layout mirrors the image store: one
    <feat_dir>/<hash[:3]>/<hash[3:]>.<feattype> file per image.
    """
    if feattype == 'hog':
        feater = HOG.FeatHOG(size=(48, 48))
    elif feattype == 'ibd':
        feater = IntraBlockDiff.FeatIntraBlockDiff()
    else:
        raise Exception("Unknown feature type!")

    list_image = []
    with open(self.list_file, 'rb') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for line in reader:
            list_image.append(line[0])

    dict_featbuf = {}
    for imgname in list_image:
        # if imgtag == 'True':
        image = os.path.join(self.img_dir, imgname[:3], imgname[3:] + '.jpg')
        dict_featbuf[imgname] = feater.feat(image)

    for imgname, desc in dict_featbuf.items():
        # print imgname, desc
        feat_dir = os.path.join(self.feat_dir, imgname[:3])  # renamed from 'dir': shadowed builtin
        if not os.path.exists(feat_dir):
            os.makedirs(feat_dir)

        featpath = os.path.join(feat_dir, imgname[3:].split('.')[0] + '.' + feattype)
        with open(featpath, 'wb') as featfile:
            featfile.write(json.dumps(desc.tolist()))
def store_feat(self, feattype='ibd'):
    """Upload every dumped feature file into HBase (cf_feat:<feattype>)."""
    if self.table is None:  # identity test with None, not '=='
        self.table = self.get_table()

    dict_featbuf = {}
    for path, subdirs, files in os.walk(self.feat_dir):
        for name in files:
            featpath = os.path.join(path, name)
            # print featpath
            with open(featpath, 'rb') as featfile:
                # Row key = <3-char bucket><hash tail>.jpg, matching
                # store_image.  os.path.basename replaces the original
                # non-portable path.split('/')[-1].
                imgname = os.path.basename(path) + name.replace('.' + feattype, '.jpg')
                dict_featbuf[imgname] = featfile.read()

    # (Dropped the original's vacuous 'except ValueError: raise' wrapper.)
    with self.table.batch(batch_size=5000) as b:
        for imgname, featdesc in dict_featbuf.items():
            b.put(imgname, {'cf_feat:' + feattype: featdesc})
def load_data(self, mode='local', feattype='ibd', tagtype='class'):
    """Load (features, labels) for the whole dataset.

    mode 'local'             — read per-image JSON feature dumps and the
                               TSV tag column from disk;
    mode 'remote' / 'hbase'  — scan the HBase table;
    mode 'spark' / 'cluster' — fetch via the Spark helper.
    Returns (X, Y): feature vectors and labels (0/1 in the first two modes).
    """
    INDEX, X, Y = [], [], []

    if mode == "local":
        # Tags come from the last column of the TSV list file...
        dict_tagbuf = {}
        with open(self.list_file, 'rb') as tsvfile:
            for line in csv.reader(tsvfile, delimiter='\t'):
                dict_tagbuf[line[0] + '.jpg'] = line[-1]

        # ...features from the per-image JSON dumps under feat_dir.
        dict_dataset = {}
        for path, subdirs, files in os.walk(self.feat_dir):
            for name in files:
                with open(os.path.join(path, name), 'rb') as featfile:
                    key = path.split('/')[-1] + name.replace('.' + feattype, '.jpg')
                    dict_dataset[key] = json.loads(featfile.read())

        for imgname, tag in dict_tagbuf.items():
            INDEX.append(imgname)
            X.append(dict_dataset[imgname])
            Y.append(1 if tag == 'True' else 0)

    elif mode in ("remote", "hbase"):
        if self.table is None:
            self.table = self.get_table()

        col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype
        for key, data in self.table.scan(columns=[col_feat, col_tag]):
            X.append(json.loads(data[col_feat]))
            Y.append(1 if data[col_tag] == 'True' else 0)

    elif mode in ("spark", "cluster"):
        if self.sparkcontex is None:
            self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV',
                                          master='spark://HPC-server:7077')

        result = self.sparkcontex.read_habase(self.table_name)  # result = {key:[feat,tag],...}
        # NOTE(review): unpacking items of 'result' as (feat, tag) pairs
        # assumes read_habase yields 2-tuples, not the dict the comment
        # above suggests — confirm against SC.Sparker.
        for feat, tag in result:
            X.append(feat)
            Y.append(tag)

    else:
        raise Exception("Unknown mode!")

    return X, Y
|
f4fb4381
Chunk
staged.
|
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
|
|
51708346
Chunk
final experiments...
|
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
|
|
d47ae6ce
Chunk
staged.
|
|
|
f1fa5b17
Chunk
review & streaming.
|
|
|
d47ae6ce
Chunk
staged.
|
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
|
|
ad70caf6
Chunk
staged.
|
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
|
|
24768a99
Chunk
mode 'hbase' fini...
|
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
|
|
489c5608
Chunk
debugging...
|
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
|
|
84648488
Chunk
reverted.
|
|
|
cb798a7f
Chunk
libs & scripts in...
|
|
|
080c30c2
Chunk
F5 lib updated. I...
|
|
|
cb798a7f
Chunk
libs & scripts in...
|
|
|
84648488
Chunk
reverted.
|
|
|
554a7b9a
Chunk
staged.
|
|
|
cb798a7f
Chunk
libs & scripts in...
|
|
|
cb798a7f
Chunk
libs & scripts in...
|
|
|
080c30c2
Chunk
F5 lib updated. I...
|
|
|
cb798a7f
Chunk
libs & scripts in...
|
|
|
84648488
Chunk
reverted.
|
|
|
080c30c2
Chunk
F5 lib updated. I...
|
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
|
|
080c30c2
Chunk
F5 lib updated. I...
|
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
|
|
080c30c2
Chunk
F5 lib updated. I...
|
|
|
080c30c2
Chunk
F5 lib updated. I...
|
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
|
|
84648488
Chunk
reverted.
|
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
|
|
84648488
Chunk
reverted.
|
|
|
bde8352b
Chunk
shuffling.
|
|
|
f1fa5b17
Chunk
review & streaming.
|
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
|
|
d0be60e7
Chunk
jpeg update.
|
|
|
ec755e37
Chunk
cropping.
|
|
|
bbd2f705
Chunk
cropping.
|
|
|
ec755e37
Chunk
cropping.
|
|
|
b9990e77
Chunk
staged.
|
|
|
84648488
Chunk
reverted.
|
|
|
bde8352b
Chunk
shuffling.
|
|
|
e6be6b61
Chunk
import caffe.
|
|
|
b9990e77
Chunk
staged.
|
|
|
ec755e37
Chunk
cropping.
|
|
|
d0be60e7
Chunk
jpeg update.
|
|
|
b9990e77
Chunk
staged.
|
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
|
|
ec755e37
Chunk
cropping.
|
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
|
|
ec755e37
Chunk
cropping.
|
|
|
84648488
Chunk
reverted.
|
|
|
02528074
Chunk
staged.
|
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
|
|
ec755e37
Chunk
cropping.
|
|
|
02528074
Chunk
staged.
|
|
|
84648488
Chunk
reverted.
|
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
|
|
02528074
Chunk
staged.
|
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
|
|
cb798a7f
Chunk
libs & scripts in...
|
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
|
|
bde8352b
Chunk
shuffling.
|
|
|
2c2d57c7
Chunk
ILSVRC datapath h...
|
|
|
bde8352b
Chunk
shuffling.
|
|
|
84648488
Chunk
reverted.
|
|
|