Blame view

mdata/ILSVRC.py 19.5 KB
2c2d57c7   Chunk   ILSVRC datapath h...
1
2
3
__author__ = 'chunk'

from . import *
84648488   Chunk   reverted.
4
from ..mfeat import IntraBlockDiff
2c2d57c7   Chunk   ILSVRC datapath h...
5
6
7
8
9
10
11
12
13
14
15
16
17
18
from ..mspark import SC
from ..common import *

import os, sys
from PIL import Image
from hashlib import md5
import csv
import shutil
import json
import collections
import happybase

from ..mjpeg import *
from ..msteg import *
9ff70cf4   Chunk   capacity engeneer...
19
from ..msteg.steganography import LSB, F3, F4, F5
2c2d57c7   Chunk   ILSVRC datapath h...
20

d1042d03   Chunk   staged.
21
22
23
24
import numpy as np
from numpy.random import randn
import pandas as pd
from scipy import stats
ec755e37   Chunk   cropping.
25
import random
d1042d03   Chunk   staged.
26

080c30c2   Chunk   F5 lib updated. I...
27
28
from subprocess import Popen, PIPE, STDOUT

84648488   Chunk   reverted.
29
# Seed numpy's global RNG with a fixed value so numpy-based random choices
# (e.g. the 'chosen' column, random payloads) are reproducible across runs.
np.random.seed(sum(map(ord, "whoami")))

# Absolute directory of this module; used to locate bundled resources.
package_dir = os.path.dirname(os.path.abspath(__file__))
080c30c2   Chunk   F5 lib updated. I...
32
33


2c2d57c7   Chunk   ILSVRC datapath h...
34
35
class DataILSVRC(DataDumperBase):
    def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val',
                 category='Train'):
        """Derive all source/destination paths for one category of the dataset.

        :param base_dir: root directory of the image collection.
        :param category: sub-directory of ``base_dir`` holding the raw images;
            also used for the destination folders and the table name.
        """
        DataDumperBase.__init__(self, base_dir, category)

        self.base_dir = base_dir
        self.category = category
        # Raw input images: <base_dir>/<category>
        self.data_dir = os.path.join(self.base_dir, self.category)

        # Everything produced by this class goes under <base_dir>/dst/<category>
        self.dst_dir = os.path.join(self.base_dir, 'dst', self.category)
        self.list_file = os.path.join(self.dst_dir, 'file-tag.tsv')
        self.feat_dir = os.path.join(self.dst_dir, 'Feat')
        self.img_dir = os.path.join(self.dst_dir, 'Img')

        # md5 hex digest -> list of per-image attributes (filled by _hash_copy)
        self.dict_data = {}

        # "<last path component of base_dir>-<category>"
        self.table_name = self.base_dir.strip('/').split('/')[-1] + '-' + self.category
        # Created lazily by load_data(mode='spark')
        self.sparker = None
2c2d57c7   Chunk   ILSVRC datapath h...
52
53

    def format(self):
        """Build the destination dataset from the raw images (delegates to ``extract``)."""
        print("formatting...")
        self.extract()

    def _hash_copy(self, image):
        """Register one image in ``self.dict_data`` and copy it into ``self.img_dir``.

        The image is keyed by the md5 hex digest of its file content. The first
        three hex characters select the sub-directory and the remainder is the
        file name, i.e. ``<img_dir>/<digest[:3]>/<digest[3:]>.jpg``.

        :param image: path of the image file to register.
        """
        if not image.endswith('jpg'):
            # Re-encode anything that is not named *jpg as JPEG before hashing.
            # NOTE(review): the scratch path is relative to the current working
            # directory, unlike _embed_inner which anchors it on package_dir.
            img = Image.open(image)
            img.save('../res/tmp.jpg', format='JPEG')
            image = '../res/tmp.jpg'

        with open(image, 'rb') as f:
            index = md5(f.read()).hexdigest()

        im = Jpeg(image, key=sample_key)
        # Row layout written later by _build_list:
        # width, height, width*height ("size"), capacity, quality
        self.dict_data[index] = [im.image_width, im.image_height,
                                 im.image_width * im.image_height,
                                 im.getCapacity(),
                                 im.getQuality()]

        bucket_dir = os.path.join(self.img_dir, index[:3])
        if not os.path.exists(bucket_dir):
            os.makedirs(bucket_dir)
        image_path = os.path.join(bucket_dir, index[3:] + '.jpg')

        # Identical content hashes to the same path, so an existing file is kept.
        if not os.path.exists(image_path):
            shutil.copy(image, image_path)
554a7b9a   Chunk   staged.
85

84648488   Chunk   reverted.
86
87
88
89
90
    def get_feat(self, image, feattype='ibd', **kwargs):
        """Compute the feature descriptor of a single image.

        :param image: passed unchanged to the extractor's ``feat`` method.
        :param feattype: feature type; only ``'ibd'`` is supported.
        :param kwargs: accepted for backward compatibility; currently unused.
        :return: whatever the extractor's ``feat`` method returns.
        :raises Exception: if ``feattype`` is not supported.
        """
        if feattype != 'ibd':
            raise Exception("Unknown feature type!")

        feater = IntraBlockDiff.FeatIntraBlockDiff()
        return feater.feat(image)

84648488   Chunk   reverted.
99
    def extract_feat(self, feattype='ibd'):
        """Compute and store a feature file for every image in ``self.list_file``.

        For each hash in the first column of the list file the descriptor of
        ``<img_dir>/<hash[:3]>/<hash[3:]>.jpg`` is written as JSON to
        ``<feat_dir>/<hash[:3]>/<hash[3:]>.<feattype>``.

        :param feattype: feature type; only ``'ibd'`` is supported.
        :raises Exception: if ``feattype`` is not supported.
        """
        print("extracting feat...")
        if feattype != 'ibd':
            raise Exception("Unknown feature type!")
        feater = IntraBlockDiff.FeatIntraBlockDiff()

        with open(self.list_file, 'rb') as tsvfile:
            list_image = [line[0] for line in csv.reader(tsvfile, delimiter='\t')]

        # Each descriptor is written as soon as it is computed, so memory use
        # does not grow with the size of the dataset.
        for imgname in list_image:
            image = os.path.join(self.img_dir, imgname[:3], imgname[3:] + '.jpg')
            desc = feater.feat(image)

            feat_subdir = os.path.join(self.feat_dir, imgname[:3])
            if not os.path.exists(feat_subdir):
                os.makedirs(feat_subdir)
            featpath = os.path.join(feat_subdir, imgname[3:].split('.')[0] + '.' + feattype)
            with open(featpath, 'wb') as featfile:
                featfile.write(json.dumps(desc.tolist()))

    def _build_list(self, list_file=None):
080c30c2   Chunk   F5 lib updated. I...
131
132
133
134
        if list_file == None:
            list_file = self.list_file
        assert list_file != None

2c2d57c7   Chunk   ILSVRC datapath h...
135
136
137
        ordict_img = collections.OrderedDict(sorted(self.dict_data.items(), key=lambda d: d[0]))

        with open(list_file, 'w') as f:
080c30c2   Chunk   F5 lib updated. I...
138
            tsvfile = csv.writer(f, delimiter='\t')
2c2d57c7   Chunk   ILSVRC datapath h...
139
140
141
142
            for key, value in ordict_img.items():
                tsvfile.writerow([key] + value)

    def _anaylis(self, list_file=None):
080c30c2   Chunk   F5 lib updated. I...
143
144
145
146
147
        if list_file == None:
            list_file = self.list_file
        assert list_file != None

        df_ILS = pd.read_csv(list_file,
84648488   Chunk   reverted.
148
                             names=['hash', 'width', 'height', 'size', 'capacity', 'quality'],
d1042d03   Chunk   staged.
149
                             sep='\t')
9ff70cf4   Chunk   capacity engeneer...
150
        length = df_ILS.shape[0]
9371f8fa   Chunk   SVM param engenee...
151
        df_ILS = df_ILS.sort(['capacity', 'size', 'quality'], ascending=True)
d1042d03   Chunk   staged.
152
        rand_class = stats.bernoulli.rvs(0.8, size=length)
9ff70cf4   Chunk   capacity engeneer...
153

d0be60e7   Chunk   jpeg update.
154
155
        df_ILS['rate'] = np.zeros(df_ILS.shape[0], np.float64)
        df_ILS['chosen'] = rand_class
d1042d03   Chunk   staged.
156
        df_ILS['class'] = np.zeros(length, np.int32)
d0be60e7   Chunk   jpeg update.
157

2c2d57c7   Chunk   ILSVRC datapath h...
158
159
        df_ILS.to_csv(list_file, header=False, index=False, sep='\t')

f1fa5b17   Chunk   review & streaming.
160
    def extract(self):
        """Register every file under ``self.data_dir`` and build the list file.

        Each file is passed to ``_hash_copy``; afterwards the collected
        attributes are written with ``_build_list`` and augmented by ``_anaylis``.
        """
        print("extracting data...")
        for path, subdirs, files in os.walk(self.data_dir):
            for name in files:
                imagepath = os.path.join(path, name)
                try:
                    self._hash_copy(imagepath)
                except Exception as e:
                    # Best effort: one unreadable file must not abort the run,
                    # but report which file was skipped instead of hiding it.
                    print('[EXCPT] %s: %s' % (imagepath, e))

        self._build_list()
        self._anaylis()
84648488   Chunk   reverted.
173

9ff70cf4   Chunk   capacity engeneer...
174
    def _embed_outer(self):
        """Embed into every chosen image with the external Java F5 tool.

        For each row of the list file with ``chosen == 1`` the ``Embed`` Java
        class is run on the image, writing ``res/tmp.jpg``; the result is
        registered through ``_hash_copy``. The tool's output is logged per
        image in ``<list_file>.embed.log``, the new images are listed in
        ``<list_file>.embed`` and finally appended to the main list file with
        ``chosen = 0`` and ``class = 1``.
        """
        self.dict_data = {}
        dict_embedresult = {}
        os.environ["CLASSPATH"] = os.path.join(package_dir, "../libs/F5/")

        # Column layout of the main list file as produced by _anaylis.
        list_columns = ['hash', 'width', 'height', 'size', 'capacity', 'quality',
                        'rate', 'chosen', 'class']
        df_ILS = pd.read_csv(self.list_file, names=list_columns, sep='\t')
        df_ILS_TARGET = df_ILS[df_ILS['chosen'] == 1]

        for img_hash, quality in zip(df_ILS_TARGET['hash'], df_ILS_TARGET['quality']):
            path_img = os.path.join(self.img_dir, img_hash[:3], img_hash[3:] + '.jpg')
            if not os.path.exists(path_img):
                continue
            print(path_img)

            # Argument list instead of a shell string: no quoting problems with
            # unusual paths and no shell involved.
            cmd = ['java', 'Embed', path_img, 'res/tmp.jpg', '-e', 'res/toembed',
                   '-p', 'password', '-c', 'stegan by chunk  ', '-q', str(int(quality))]
            p = Popen(cmd, stdout=PIPE, stderr=STDOUT)
            output, _ = p.communicate()  # also waits for the tool to finish
            dict_embedresult[img_hash] = output.splitlines()

            # NOTE(review): if the tool failed, res/tmp.jpg may still be the
            # previous image's output - confirm how Embed reports failure.
            try:
                self._hash_copy('res/tmp.jpg')
            except Exception as e:
                print('[EXCPT] %s: %s' % (path_img, e))

        with open(self.list_file + '.embed.log', 'wb') as f:
            tsvfile = csv.writer(f, delimiter='\t')
            for key, value in dict_embedresult.items():
                tsvfile.writerow([key] + value)

        self._build_list(self.list_file + '.embed')

        # merge
        # _hash_copy stores width, height, size, capacity, quality per image.
        df_ILS_EMBED = pd.read_csv(self.list_file + '.embed',
                                   names=['hash', 'width', 'height', 'size', 'capacity',
                                          'quality'],
                                   sep='\t')
        length = df_ILS_EMBED.shape[0]
        sort_keys = ['size', 'quality']
        # DataFrame.sort was removed in pandas 0.20.
        if hasattr(df_ILS_EMBED, 'sort_values'):
            df_ILS_EMBED = df_ILS_EMBED.sort_values(sort_keys, ascending=True)
        else:
            df_ILS_EMBED = df_ILS_EMBED.sort(sort_keys, ascending=True)
        # The external tool does not report an embedding rate.
        df_ILS_EMBED['rate'] = np.nan
        df_ILS_EMBED['chosen'] = np.zeros(length, np.int32)
        df_ILS_EMBED['class'] = np.ones(length, np.int32)

        # Same column order on both sides, because the file is written headerless.
        df_ILS = pd.concat([df_ILS, df_ILS_EMBED[list_columns]], ignore_index=True)
        df_ILS.to_csv(self.list_file, header=False, index=False, sep='\t')

    def _embed_inner(self, rate=None):
        """Embed into every chosen image with the in-process F5 implementation.

        For each row of the list file with ``chosen == 1`` a stego image is
        written to a scratch file, hashed, stored under ``self.img_dir`` and
        recorded together with the embedding rate returned by the embedder.
        The new images are listed in ``<list_file>.embed`` and then appended
        to the main list file with ``chosen = 0`` and ``class = 1``.

        :param rate: ``None`` embeds the bundled ``res/toembed`` file; otherwise
            a fraction in ``[0, 1)`` of each image's capacity (in bits) that is
            filled with random bytes.
        :raises ValueError: if ``rate`` is given but outside ``[0, 1)``.
        """
        # Validate once, up front, instead of asserting inside the loop
        # (asserts vanish under -O).
        if rate is not None and not (0 <= rate < 1):
            raise ValueError("rate must satisfy 0 <= rate < 1, got %r" % (rate,))

        self.dict_data = {}
        f5 = F5.F5(sample_key, 1)
        tmp_img = os.path.join(package_dir, '../res/tmp.jpg')
        df_ILS = pd.read_csv(self.list_file,
                             names=['hash', 'width', 'height', 'size', 'capacity', 'quality',
                                    'rate', 'chosen',
                                    'class'],
                             sep='\t')
        df_ILS_TARGET = df_ILS[df_ILS['chosen'] == 1]

        for img_hash, capacity in zip(df_ILS_TARGET['hash'], df_ILS_TARGET['capacity']):
            path_img = os.path.join(self.img_dir, img_hash[:3], img_hash[3:] + '.jpg')
            if not os.path.exists(path_img):
                continue
            print(path_img)

            if rate is None:
                embed_rate = f5.embed_raw_data(path_img,
                                               os.path.join(package_dir, '../res/toembed'),
                                               tmp_img)
            else:
                # Floor division keeps the byte count an integer on Python 3 too.
                hidden = np.random.bytes(int(capacity * rate) // 8)
                embed_rate = f5.embed_raw_data(path_img, hidden, tmp_img, frommem=True)

            try:
                with open(tmp_img, 'rb') as f:
                    index = md5(f.read()).hexdigest()
                im = Jpeg(tmp_img, key=sample_key)
                self.dict_data[index] = [im.image_width, im.image_height,
                                         im.image_width * im.image_height,
                                         im.getCapacity(),
                                         im.getQuality(), embed_rate]

                stego_dir = os.path.join(self.img_dir, index[:3])
                if not os.path.exists(stego_dir):
                    os.makedirs(stego_dir)
                image_path = os.path.join(stego_dir, index[3:] + '.jpg')
                if not os.path.exists(image_path):
                    shutil.copy(tmp_img, image_path)
            except Exception as e:
                # Best effort per image, but do not hide what went wrong.
                print('[EXCPT] %s: %s' % (path_img, e))

        self._build_list(self.list_file + '.embed')

        # merge
        df_ILS_EMBED = pd.read_csv(self.list_file + '.embed',
                                   names=['hash', 'width', 'height', 'size', 'capacity', 'quality',
                                          'rate'],
                                   sep='\t')

        sort_keys = ['rate', 'capacity', 'size', 'quality']
        # DataFrame.sort was removed in pandas 0.20.
        if hasattr(df_ILS_EMBED, 'sort_values'):
            df_ILS_EMBED = df_ILS_EMBED.sort_values(sort_keys, ascending=True)
        else:
            df_ILS_EMBED = df_ILS_EMBED.sort(sort_keys, ascending=True)
        df_ILS_EMBED['chosen'] = np.zeros(df_ILS_EMBED.shape[0], np.int32)
        df_ILS_EMBED['class'] = np.ones(df_ILS_EMBED.shape[0], np.int32)

        # Both frames now share the same columns in the same order.
        # DataFrame.append was removed in pandas 2.0; concat works everywhere.
        df_ILS = pd.concat([df_ILS, df_ILS_EMBED], ignore_index=True)
        df_ILS.to_csv(self.list_file, header=False, index=False, sep='\t')

ec755e37   Chunk   cropping.
281
282
283
284
285
286
287
288
    def embed(self, rate=None):
        """Public entry point for embedding; delegates to ``_embed_inner``.

        :param rate: forwarded unchanged to ``_embed_inner``.
        """
        print("embedding data...")
        self._embed_inner(rate)

    def crop(self, size=(300, 300)):
        """Save one randomly positioned crop of every sufficiently large image.

        Every file under ``self.data_dir`` is opened with PIL; images smaller
        than ``size`` are skipped. The crop is saved under the original file
        name in ``<data_dir>_crop_pil`` using the source image's quantization
        tables.

        :param size: ``(width, height)`` of the crop in pixels.
        """
        cropped_dir = self.data_dir + '_crop_pil'
        if not os.path.exists(cropped_dir):
            os.makedirs(cropped_dir)

        W, H = size
        for path, subdirs, files in os.walk(self.data_dir):
            for name in files:
                image = os.path.join(path, name)
                print(image)

                try:
                    im = Image.open(image)
                    # Reuse the source quantization tables when saving the crop.
                    qt = im.quantization
                    w, h = im.size
                    if w < W or h < H:
                        continue
                    left, upper = random.randint(0, w - W), random.randint(0, h - H)
                    im = im.crop((left, upper, left + W, upper + H))
                    # NOTE(review): files with the same name in different
                    # sub-directories overwrite each other here.
                    im.save(os.path.join(cropped_dir, name), qtables=qt)
                except Exception as e:
                    print('[EXCPT] %s' % (e,))

    def get_table(self):
        print "getting table..."
        if self.table != None:
            return self.table
f4fb4381   Chunk   staged.
324
325
326
327
328

        if self.connection is None:
            c = happybase.Connection('HPC-server')
            self.connection = c

2c2d57c7   Chunk   ILSVRC datapath h...
329
330
331
332
333
        tables = self.connection.tables()
        if self.table_name not in tables:
            families = {'cf_pic': dict(),
                        'cf_info': dict(max_versions=10),
                        'cf_tag': dict(),
51708346   Chunk   final experiments...
334
                        'cf_feat': dict(),
2c2d57c7   Chunk   ILSVRC datapath h...
335
336
337
338
339
340
341
                        }
            self.connection.create_table(name=self.table_name, families=families)

        table = self.connection.table(name=self.table_name)

        self.table = table

d47ae6ce   Chunk   staged.
342
        return table
f1fa5b17   Chunk   review & streaming.
343

d47ae6ce   Chunk   staged.
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
    def delete_table(self, table_name=None, disable=True):
        """Delete an HBase table.

        :param table_name: table to delete; defaults to ``self.table_name``.
        :param disable: forwarded to happybase's ``delete_table``.
        :return: ``False`` if the table does not exist, ``True`` after deletion.
        """
        print("deleting table...")
        if table_name is None:
            table_name = self.table_name

        if self.connection is None:
            self.connection = happybase.Connection('HPC-server')

        if table_name not in self.connection.tables():
            return False

        try:
            self.connection.delete_table(table_name, disable)
        except Exception:
            print('Exception when deleting table.')
            raise

        # Drop the cached handle so get_table() does not hand out a reference
        # to the table that was just deleted.
        if table_name == self.table_name:
            self.table = None
        return True
2c2d57c7   Chunk   ILSVRC datapath h...
363
364
365
366
367
368
369
370
371

    def store_img(self):
        """Upload the bytes of every listed image to column ``cf_pic:data``.

        The row key is ``<hash>.jpg``. Listed images that are missing on disk
        are skipped.
        """
        if self.table is None:
            self.table = self.get_table()

        # Images go straight into the batch, so only one batch is held in
        # memory rather than the whole dataset.
        with open(self.list_file, 'rb') as tsvfile:
            with self.table.batch(batch_size=2000) as b:
                for line in csv.reader(tsvfile, delimiter='\t'):
                    path_img = os.path.join(self.img_dir, line[0][:3], line[0][3:] + '.jpg')
                    if not os.path.exists(path_img):
                        continue
                    with open(path_img, 'rb') as fpic:
                        b.put(line[0] + '.jpg', {'cf_pic:data': fpic.read()})
cb798a7f   Chunk   libs & scripts in...
384
385
386
387
388
389
390
391
392

    def store_info(self, infotype='all'):
        """Upload the per-image attributes from the list file to family ``cf_info``.

        Every row contributes its columns between the hash and the last two
        columns; the first five of them are stored as width, height, size,
        capacity and quality under the row key ``<hash>.jpg``.

        :param infotype: only ``'all'`` is supported.
        :raises Exception: if ``infotype`` is not supported.
        """
        # Reject bad input before touching HBase or the file system.
        if infotype != 'all':
            raise Exception("Unknown infotype!")

        if self.table is None:
            self.table = self.get_table()

        with open(self.list_file, 'rb') as tsvfile:
            with self.table.batch(batch_size=5000) as b:
                for line in csv.reader(tsvfile, delimiter='\t'):
                    imginfo = line[1:-2]
                    b.put(line[0] + '.jpg',
                          {'cf_info:width': imginfo[0], 'cf_info:height': imginfo[1],
                           'cf_info:size': imginfo[2],
                           'cf_info:capacity': imginfo[3],
                           'cf_info:quality': imginfo[4]})
080c30c2   Chunk   F5 lib updated. I...
409

2c2d57c7   Chunk   ILSVRC datapath h...
410
411
412
413
414
415
416
417
    def store_tag(self, tagtype='all'):
        """Upload the last two list-file columns of every row to family ``cf_tag``.

        The second-to-last column is stored as ``cf_tag:chosen`` and the last
        as ``cf_tag:class`` under the row key ``<hash>.jpg``.

        :param tagtype: only ``'all'`` is supported.
        :raises Exception: if ``tagtype`` is not supported.
        """
        # Reject bad input before touching HBase or the file system.
        if tagtype != 'all':
            raise Exception("Unknown tagtype!")

        if self.table is None:
            self.table = self.get_table()

        with open(self.list_file, 'rb') as tsvfile:
            with self.table.batch(batch_size=5000) as b:
                for line in csv.reader(tsvfile, delimiter='\t'):
                    imgtag = line[-2:]
                    b.put(line[0] + '.jpg',
                          {'cf_tag:chosen': imgtag[0], 'cf_tag:class': imgtag[1]})
84648488   Chunk   reverted.
430

2c2d57c7   Chunk   ILSVRC datapath h...
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
    def store_feat(self, feattype='ibd'):
        """Upload every ``*.<feattype>`` file under ``self.feat_dir`` to HBase.

        The row key is rebuilt from the file location: the name of the
        containing directory followed by the file name with its feature
        suffix replaced by ``.jpg``. File content is stored unmodified in
        column ``cf_feat:<feattype>``.

        :param feattype: feature type / file suffix to upload.
        """
        if self.table is None:
            self.table = self.get_table()

        suffix = '.' + feattype
        with self.table.batch(batch_size=5000) as b:
            for path, subdirs, files in os.walk(self.feat_dir):
                for name in files:
                    # Files of other feature types must not end up in this column.
                    if not name.endswith(suffix):
                        continue
                    imgname = os.path.basename(path) + name[:-len(suffix)] + '.jpg'
                    with open(os.path.join(path, name), 'rb') as featfile:
                        b.put(imgname, {'cf_feat:' + feattype: featfile.read()})

84648488   Chunk   reverted.
452
    def load_data(self, mode='local', feattype='ibd', tagtype='class', shuffle=False):
        """Load feature vectors and labels.

        :param mode: ``'local'`` reads the list file plus image/feature files,
            ``'hbase'`` scans the HBase table, ``'spark'`` reads the table
            through the Spark helper.
        :param feattype: feature type. In local mode ``'coef'`` derives the
            vector from the image's Y-channel coefficient matrix; any other
            value is read from ``<feat_dir>/.../<hash>.<feattype>`` as JSON.
        :param tagtype: tag column used in ``'hbase'`` mode (``cf_tag:<tagtype>``).
        :param shuffle: if true, return a shuffled list of ``(x, y)`` pairs
            instead of the ``(X, Y)`` tuple.
        :return: ``(X, Y)`` lists, or a list of ``(x, y)`` pairs when
            ``shuffle`` is true.
        :raises Exception: if ``mode`` is unknown.
        """
        print("loading data...")
        X = []
        Y = []

        if mode == "local":

            dict_dataset = {}

            if feattype == 'coef':  # raw
                with open(self.list_file, 'rb') as tsvfile:
                    for line in csv.reader(tsvfile, delimiter='\t'):
                        img_hash = line[0]
                        tag = line[-1]
                        image = os.path.join(self.img_dir, img_hash[:3], img_hash[3:] + '.jpg')
                        if os.path.exists(image):
                            im = Jpeg(image, key=sample_key)
                            dict_dataset[img_hash] = (tag, im.getCoefMatrix(channel='Y'))

                # Flat indices of every 8th row/column position, computed once.
                # NOTE(review): the 200 presumably is the matrix side length
                # (these would then be the first coefficient of each 8x8
                # block) - confirm against getCoefMatrix.
                block_origins = [i * 200 + j for i in range(0, 200, 8) for j in range(0, 200, 8)]
                for tag, feat in dict_dataset.values():
                    feat.ravel()[block_origins] = 0
                    # Keep only the least significant bit of each magnitude.
                    feat = np.absolute(feat)
                    feat = np.bitwise_and(feat, 1)
                    X.append(feat.ravel())
                    Y.append(int(tag))

            else:
                with open(self.list_file, 'rb') as tsvfile:
                    for line in csv.reader(tsvfile, delimiter='\t'):
                        img_hash = line[0]
                        tag = line[-1]
                        path_feat = os.path.join(self.feat_dir, img_hash[:3],
                                                 img_hash[3:] + '.' + feattype)
                        if os.path.exists(path_feat):
                            with open(path_feat, 'rb') as featfile:
                                dict_dataset[img_hash] = (tag, json.loads(featfile.read()))

                for tag, feat in dict_dataset.values():
                    X.append(np.array(feat).ravel().tolist())
                    Y.append(int(tag))

        elif mode == "hbase":  # remote
            if self.table is None:
                self.table = self.get_table()

            col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype
            for key, data in self.table.scan(columns=[col_feat, col_tag]):
                # Flatten the three-level nested list stored as JSON.
                X.append(
                    [item for sublist in json.loads(data[col_feat]) for subsublist in sublist for
                     item in subsublist])
                Y.append(int(data[col_tag]))

        elif mode == "spark":  # cluster
            if self.sparker is None:
                self.sparker = SC.Sparker(host='HPC-server', appname='ImageCV',
                                          master='spark://HPC-server:7077')

            result = self.sparker.read_hbase(self.table_name)  # result = {key:[feat,tag],...}
            for feat, tag in result:
                X.append(feat)
                Y.append(tag)

        else:
            raise Exception("Unknown mode!")

        if shuffle:
            # Materialise the pairs so they can be shuffled in place
            # (zip is lazy on Python 3).
            Z = list(zip(X, Y))
            np.random.shuffle(Z)
            return Z

        return X, Y