Blame view

mdata/ILSVRC.py 19.4 KB
2c2d57c7   Chunk   ILSVRC datapath h...
1
2
3
__author__ = 'chunk'

from . import *
84648488   Chunk   reverted.
4
from ..mfeat import IntraBlockDiff
2c2d57c7   Chunk   ILSVRC datapath h...
5
6
7
8
9
10
11
12
13
14
15
16
17
18
from ..mspark import rdd, SC
from ..common import *

import os, sys
from PIL import Image
from hashlib import md5
import csv
import shutil
import json
import collections
import happybase

from ..mjpeg import *
from ..msteg import *
9ff70cf4   Chunk   capacity engeneer...
19
from ..msteg.steganography import LSB, F3, F4, F5
2c2d57c7   Chunk   ILSVRC datapath h...
20

d1042d03   Chunk   staged.
21
22
23
24
import numpy as np
from numpy.random import randn
import pandas as pd
from scipy import stats
ec755e37   Chunk   cropping.
25
import random
d1042d03   Chunk   staged.
26

080c30c2   Chunk   F5 lib updated. I...
27
28
from subprocess import Popen, PIPE, STDOUT

84648488   Chunk   reverted.
29

d1042d03   Chunk   staged.
30
31
# Seed numpy's global generator with a fixed value (the sum of the character
# codes of "whoami") so that random draws repeat from run to run.
np.random.seed(sum(ord(ch) for ch in "whoami"))

# Absolute path of the directory that contains this module.
package_dir = os.path.dirname(os.path.abspath(__file__))

2c2d57c7   Chunk   ILSVRC datapath h...
34
35

class DataILSVRC(DataDumperBase):
84648488   Chunk   reverted.
36
    def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train',
                 host='HPC-server'):
        """Set up the paths and connection settings for one dataset category.

        :param base_dir: root directory of the dataset.
        :param category: sub-directory of ``base_dir`` that holds the source images.
        :param host: host name used for the Spark master URL and for HBase connections.
        """
        DataDumperBase.__init__(self, base_dir, category)

        self.base_dir, self.category, self.host = base_dir, category, host

        # Source images are read from <base_dir>/<category>.
        self.data_dir = os.path.join(base_dir, category)

        # Everything this class produces is written below <base_dir>/dst/<category>.
        dst_dir = os.path.join(base_dir, 'dst', category)
        self.dst_dir = dst_dir
        self.list_file = os.path.join(dst_dir, 'file-tag.tsv')
        self.feat_dir = os.path.join(dst_dir, 'Feat')
        self.img_dir = os.path.join(dst_dir, 'Img')

        self.master = 'spark://%s:7077' % host
        self.appname = 'ImageILSVRC'

        # MD5 hex digest -> list of image info, filled while images are hashed.
        self.dict_data = {}

        # Table name is "<last path component of base_dir>-<category>".
        self.table_name = '-'.join([base_dir.strip('/').split('/')[-1], category])
        self.sparker = None

    def format(self):
        """Announce the formatting step and run ``extract``."""
        print("formatting...")
        self.extract()

2c2d57c7   Chunk   ILSVRC datapath h...
62
63
64
65
66
    def _hash_copy(self, image):
        """Record an image's info under its MD5 digest and copy it into ``img_dir``.

        A file whose name does not end in ``jpg`` is first re-saved as JPEG to
        ``../res/tmp.jpg`` and that temporary file is processed instead.  The
        hex MD5 digest of the file content is the key:

        * ``dict_data[digest]`` is set to ``[width, height, width * height,
          getCapacity(), getQuality()]`` as reported by ``Jpeg``;
        * the file is copied to ``<img_dir>/<digest[:3]>/<digest[3:]>.jpg``
          unless something already exists at that path.

        :param image: path of the image file.
        """
        tmp_jpg = '../res/tmp.jpg'
        if not image.endswith('jpg'):
            Image.open(image).save(tmp_jpg, format='JPEG')
            image = tmp_jpg

        with open(image, 'rb') as src:
            digest = md5(src.read()).hexdigest()

        jpg = Jpeg(image, key=sample_key)
        width, height = jpg.image_width, jpg.image_height
        self.dict_data[digest] = [width, height, width * height, jpg.getCapacity(), jpg.getQuality()]

        # The first three hex characters select the sub-directory, the rest name the file.
        bucket = os.path.join(self.img_dir, digest[:3])
        if not os.path.exists(bucket):
            os.makedirs(bucket)

        target = os.path.join(bucket, digest[3:] + '.jpg')
        if not os.path.exists(target):
            shutil.copy(image, target)

    def get_feat(self, image, feattype='ibd', **kwargs):
        """Compute the feature descriptor of a single image.

        :param image: image handed to the extractor's ``feat`` method.
        :param feattype: feature type; only ``'ibd'`` is supported.
        :param kwargs: accepted but not used.
        :return: whatever the extractor's ``feat`` method returns.
        :raises Exception: if ``feattype`` is not ``'ibd'``.
        """
        if feattype != 'ibd':
            raise Exception("Unknown feature type!")

        return IntraBlockDiff.FeatIntraBlockDiff().feat(image)


554a7b9a   Chunk   staged.
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
    def extract_feat(self, feattype='ibd'):
        """Compute a feature for every image in the list file and save it as JSON.

        The first column of each row in ``list_file`` is taken as the image
        hash.  All descriptors are computed first; only then is each one
        written to ``<feat_dir>/<hash[:3]>/<hash[3:]>.<feattype>``.

        :param feattype: feature type; only ``'ibd'`` is supported.
        :raises Exception: if ``feattype`` is not ``'ibd'``.
        """
        print("extracting feat...")
        if feattype != 'ibd':
            raise Exception("Unknown feature type!")
        feater = IntraBlockDiff.FeatIntraBlockDiff()

        with open(self.list_file, 'rb') as listing:
            hashes = [row[0] for row in csv.reader(listing, delimiter='\t')]

        # Phase 1: compute every descriptor before anything is written to disk.
        descriptors = {}
        for name in hashes:
            img_path = os.path.join(self.img_dir, name[:3], name[3:] + '.jpg')
            descriptors[name] = feater.feat(img_path)

        # Phase 2: dump each descriptor next to its hash bucket.
        for name, desc in descriptors.items():
            out_dir = os.path.join(self.feat_dir, name[:3])
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_path = os.path.join(out_dir, name[3:].split('.')[0] + '.' + feattype)
            with open(out_path, 'wb') as out:
                out.write(json.dumps(desc.tolist()))
2c2d57c7   Chunk   ILSVRC datapath h...
135
136
137

    def _build_list(self, list_file=None):
        """Write ``dict_data`` to a TSV file, one row per key in ascending key order.

        Each row is the key followed by the items of its value list.

        :param list_file: output path; defaults to ``self.list_file``.
        """
        target = self.list_file if list_file is None else list_file
        assert target is not None

        with open(target, 'w') as out:
            writer = csv.writer(out, delimiter='\t')
            for key in sorted(self.dict_data):
                writer.writerow([key] + self.dict_data[key])

84648488   Chunk   reverted.
148
    def _anaylis(self, list_file=None):
        """Sort the list file and add the ``rate``, ``chosen`` and ``class`` columns.

        The file is read as six tab-separated columns (hash, width, height,
        size, capacity, quality), sorted ascending by capacity, size and
        quality, extended with

        * ``rate``   -- 0.0 for every row,
        * ``chosen`` -- one Bernoulli(0.8) draw per row,
        * ``class``  -- 0 for every row,

        and written back to the same path without header or index.

        :param list_file: path of the TSV file; defaults to ``self.list_file``.
        """
        if list_file is None:
            list_file = self.list_file
        assert list_file is not None

        df_ILS = pd.read_csv(list_file, names=['hash', 'width', 'height', 'size', 'capacity', 'quality'], sep='\t')
        length = df_ILS.shape[0]

        # DataFrame.sort was removed in pandas 0.20; use sort_values when it
        # exists and fall back to the old name only on older releases.
        sort_rows = getattr(df_ILS, 'sort_values', None) or df_ILS.sort
        df_ILS = sort_rows(['capacity', 'size', 'quality'], ascending=True)

        rand_class = stats.bernoulli.rvs(0.8, size=length)

        df_ILS['rate'] = np.zeros(length, np.float64)
        df_ILS['chosen'] = rand_class
        df_ILS['class'] = np.zeros(length, np.int32)

        df_ILS.to_csv(list_file, header=False, index=False, sep='\t')

d1042d03   Chunk   staged.
164
165
166
167
168
    def extract(self):
        """Hash and copy every file under ``data_dir``, then build the list file.

        Each file found by walking ``data_dir`` is passed to ``_hash_copy``.
        A file that cannot be processed is reported and skipped so that one
        bad file does not stop the run.  Afterwards ``_build_list`` and
        ``_anaylis`` are called.
        """
        print("extracting data...")
        for path, subdirs, files in os.walk(self.data_dir):
            for name in files:
                imagepath = os.path.join(path, name)
                try:
                    self._hash_copy(imagepath)
                except Exception as e:
                    # Best effort per file: report the failure instead of
                    # hiding it.  Catching Exception (not a bare except)
                    # keeps Ctrl-C and SystemExit working during the walk.
                    print('[EXCPT] %s: %s' % (imagepath, e))

        self._build_list()
        self._anaylis()

d0be60e7   Chunk   jpeg update.
178

080c30c2   Chunk   F5 lib updated. I...
179
    def _embed_outer(self):
        """Embed into every chosen image with the external Java ``Embed`` tool.

        For each row of the list file whose ``chosen`` value is 1, ``java
        Embed`` is run on the stored image, writing ``res/tmp.jpg``; the
        tool's output lines are collected per hash and written to
        ``<list_file>.embed.log``.  The result image is registered through
        ``_hash_copy``, the registered images are written to
        ``<list_file>.embed``, tagged ``chosen=0`` / ``class=1`` and appended
        to the list file.

        NOTE(review): the column names used when reading the list file do not
        include the ``rate`` column that ``_anaylis`` writes, and the
        ``.embed`` file is read with five names although ``_hash_copy``
        stores five values per hash (six columns with the key).  This method
        looks out of date compared with ``_embed_inner`` -- confirm the
        schema before relying on it.
        """
        self.dict_data = {}
        dict_embedresult = {}
        os.environ["CLASSPATH"] = os.path.join(package_dir, "../libs/F5/")

        df_ILS = pd.read_csv(self.list_file,
                             names=['hash', 'width', 'height', 'size', 'capacity', 'quality', 'chosen', 'class'],
                             sep='\t')
        df_ILS_TARGET = df_ILS[df_ILS['chosen'] == 1]

        for img_hash, quality in zip(df_ILS_TARGET['hash'], df_ILS_TARGET['quality']):
            path_img = os.path.join(self.img_dir, img_hash[:3], img_hash[3:] + '.jpg')
            # The original test `if path_img:` was always true; check the file itself.
            if not os.path.exists(path_img):
                print('[SKIP] missing image: %s' % path_img)
                continue

            print(path_img)
            # Argument list instead of a shell string: paths are passed
            # verbatim, with no quoting problems and no shell interpretation.
            cmd = ['java', 'Embed', path_img, 'res/tmp.jpg', '-e', 'res/toembed', '-p', 'password',
                   '-c', 'stegan by chunk  ', '-q', '%d' % quality]
            p = Popen(cmd, stdout=PIPE, stderr=STDOUT)
            dict_embedresult[img_hash] = [line.strip('\n') for line in p.stdout.readlines()]
            p.wait()  # reap the child so finished processes do not pile up

            try:
                self._hash_copy('res/tmp.jpg')
            except Exception as e:
                # Best effort per image, but report what went wrong.
                print('[EXCPT] %s: %s' % (path_img, e))

        with open(self.list_file + '.embed.log', 'wb') as f:
            tsvfile = csv.writer(f, delimiter='\t')
            for key, value in dict_embedresult.items():
                tsvfile.writerow([key] + value)

        self._build_list(self.list_file + '.embed')

        # merge
        df_ILS_EMBED = pd.read_csv(self.list_file + '.embed', names=['hash', 'width', 'height', 'size', 'quality'],
                                   sep='\t')
        length = df_ILS_EMBED.shape[0]
        # DataFrame.sort was removed in pandas 0.20; prefer sort_values when present.
        sort_rows = getattr(df_ILS_EMBED, 'sort_values', None) or df_ILS_EMBED.sort
        df_ILS_EMBED = sort_rows(['size', 'quality'], ascending=True)
        df_ILS_EMBED['chosen'] = np.zeros(length, np.int32)
        df_ILS_EMBED['class'] = np.ones(length, np.int32)

        df_ILS = df_ILS.append(df_ILS_EMBED, ignore_index=True)
        df_ILS.to_csv(self.list_file, header=False, index=False, sep='\t')

84648488   Chunk   reverted.
220
    def _embed_inner(self, rate=None):
        """Embed data into every chosen image with the in-process F5 encoder.

        For each row of the list file whose ``chosen`` value is 1 the stored
        image is embedded into ``<package_dir>/../res/tmp.jpg``:

        * ``rate is None`` -- the payload is the file
          ``<package_dir>/../res/toembed``;
        * otherwise -- the payload is ``int(capacity * rate) // 8`` random
          bytes.

        The result is hashed, its info (including the value returned by
        ``embed_raw_data``) is stored in ``dict_data`` and the file is copied
        into ``img_dir``.  Finally the new rows are written to
        ``<list_file>.embed``, tagged ``chosen=0`` / ``class=1`` and appended
        to the list file.

        :param rate: ``None`` or a number in ``[0, 1)``.
        """
        self.dict_data = {}
        f5 = F5.F5(sample_key, 1)
        tmp_img = os.path.join(package_dir, '../res/tmp.jpg')
        df_ILS = pd.read_csv(self.list_file,
                             names=['hash', 'width', 'height', 'size', 'capacity', 'quality', 'rate', 'chosen',
                                    'class'],
                             sep='\t')
        df_ILS_TARGET = df_ILS[df_ILS['chosen'] == 1]

        for img_hash, capacity in zip(df_ILS_TARGET['hash'], df_ILS_TARGET['capacity']):
            path_img = os.path.join(self.img_dir, img_hash[:3], img_hash[3:] + '.jpg')
            # The original test `if path_img:` was always true; check the file itself.
            if not os.path.exists(path_img):
                print('[SKIP] missing image: %s' % path_img)
                continue

            print(path_img)
            if rate is None:
                embed_rate = f5.embed_raw_data(path_img, os.path.join(package_dir, '../res/toembed'), tmp_img)
            else:
                assert (rate >= 0 and rate < 1)
                # Floor division keeps the byte count an integer regardless
                # of the interpreter's `/` semantics.
                hidden = np.random.bytes(int(capacity * rate) // 8)
                embed_rate = f5.embed_raw_data(path_img, hidden, tmp_img, frommem=True)

            try:
                with open(tmp_img, 'rb') as f:
                    index = md5(f.read()).hexdigest()
                im = Jpeg(tmp_img, key=sample_key)
                self.dict_data[index] = [im.image_width, im.image_height, im.image_width * im.image_height,
                                         im.getCapacity(),
                                         im.getQuality(), embed_rate]

                bucket = os.path.join(self.img_dir, index[:3])
                if not os.path.exists(bucket):
                    os.makedirs(bucket)
                image_path = os.path.join(bucket, index[3:] + '.jpg')
                if not os.path.exists(image_path):
                    shutil.copy(tmp_img, image_path)
            except Exception as e:
                # Best effort per image, but report the failure instead of
                # silently dropping the sample.
                print('[EXCPT] %s: %s' % (path_img, e))

        self._build_list(self.list_file + '.embed')

        # merge
        df_ILS_EMBED = pd.read_csv(self.list_file + '.embed',
                                   names=['hash', 'width', 'height', 'size', 'capacity', 'quality', 'rate'],
                                   sep='\t')

        # DataFrame.sort was removed in pandas 0.20; prefer sort_values when present.
        sort_rows = getattr(df_ILS_EMBED, 'sort_values', None) or df_ILS_EMBED.sort
        df_ILS_EMBED = sort_rows(['rate', 'capacity', 'size', 'quality'], ascending=True)
        df_ILS_EMBED['chosen'] = np.zeros(df_ILS_EMBED.shape[0], np.int32)
        df_ILS_EMBED['class'] = np.ones(df_ILS_EMBED.shape[0], np.int32)

        df_ILS = df_ILS.append(df_ILS_EMBED, ignore_index=True)
        df_ILS.to_csv(self.list_file, header=False, index=False, sep='\t')
84648488   Chunk   reverted.
276

ec755e37   Chunk   cropping.
277
    def embed(self, rate=None):
        """Announce the embedding step and run ``_embed_inner``.

        :param rate: passed through unchanged to ``_embed_inner``.
        """
        print("embedding data...")
        self._embed_inner(rate)

ec755e37   Chunk   cropping.
281
282
283
284
285
286
287
288

    def crop(self, size=(300, 300)):
        """Save a randomly positioned crop of every large-enough image.

        Every file under ``data_dir`` is opened with PIL.  Images narrower or
        shorter than ``size`` are skipped; the others are cropped at a random
        position and saved under the same file name in
        ``<data_dir>_crop_pil`` with the source image's quantization tables.
        Any error on a file is printed and the walk continues.

        :param size: ``(width, height)`` of the crop.
        """
        out_dir = self.data_dir + '_crop_pil'
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        for root, _dirs, filenames in os.walk(self.data_dir):
            for filename in filenames:
                src = os.path.join(root, filename)
                print(src)

                crop_w, crop_h = size
                try:
                    picture = Image.open(src)
                    tables = picture.quantization
                    width, height = picture.size
                    if width >= crop_w and height >= crop_h:
                        x0 = random.randint(0, width - crop_w)
                        y0 = random.randint(0, height - crop_h)
                        box = (x0, y0, x0 + crop_w, y0 + crop_h)
                        picture.crop(box).save(os.path.join(out_dir, filename), qtables=tables)
                except Exception as err:
                    print('[EXCPT] %s' % (err,))
25c0c9c9   Chunk   feat.ravel()[[i*3...
304

e6be6b61   Chunk   import caffe.
305
                    # try:
bde8352b   Chunk   shuffling.
306
307
                    # img = cv2.imread(image, cv2.CV_LOAD_IMAGE_UNCHANGED)
                    # h, w = img.shape[:2]
ec755e37   Chunk   cropping.
308
                    # if w < 300 or h < 300:
bde8352b   Chunk   shuffling.
309
                    # continue
ec755e37   Chunk   cropping.
310
311
                    # left, upper = random.randint(0, w - 300), random.randint(0, h - 300)
                    # img_crop = img[upper:upper + 300, left:left + 300]
84648488   Chunk   reverted.
312
                    # cv2.imwrite(os.path.join(base_dir, category + '_crop_cv', name), img_crop)
2c2d57c7   Chunk   ILSVRC datapath h...
313
                    # except Exception as e:
f1fa5b17   Chunk   review & streaming.
314
                    # print '[EXCPT]', e
2c2d57c7   Chunk   ILSVRC datapath h...
315
316
317
318
319
320
321
322
323
                    # pass


    def get_table(self):
        """Return the HBase table named ``self.table_name``, creating it if needed.

        A table handle already stored in ``self.table`` is returned as is.
        Otherwise a connection to ``self.host`` is opened when
        ``self.connection`` is ``None``, the table is created with the column
        families ``cf_pic``, ``cf_info`` (10 versions), ``cf_tag`` and
        ``cf_feat`` if it does not exist yet, and the handle is cached in
        ``self.table``.

        NOTE(review): ``self.table`` and ``self.connection`` are not assigned
        in this class's ``__init__``; presumably the base class sets them --
        confirm.

        :return: the table handle.
        """
        print("getting table...")
        if self.table is not None:
            return self.table

        if self.connection is None:
            self.connection = happybase.Connection(host=self.host)

        if self.table_name not in self.connection.tables():
            # An LZO-compressed variant of these families used to be defined
            # here as well, but it was never passed to create_table.
            column_families = {
                'cf_pic': dict(),
                'cf_info': dict(max_versions=10),
                'cf_tag': dict(),
                'cf_feat': dict(),
            }
            self.connection.create_table(name=self.table_name, families=column_families)

        self.table = self.connection.table(name=self.table_name)
        return self.table

    def delete_table(self, table_name=None, disable=True):
        """Delete an HBase table if it exists.

        :param table_name: table to delete; defaults to ``self.table_name``.
        :param disable: passed through to the connection's ``delete_table``.
        :return: ``False`` if the table does not exist, ``True`` after a
            successful delete.  An error raised by the delete is announced
            and re-raised.
        """
        print("deleting table...")
        name = self.table_name if table_name is None else table_name

        if self.connection is None:
            self.connection = happybase.Connection(host=self.host)

        if name not in self.connection.tables():
            return False

        try:
            self.connection.delete_table(name, disable)
        except:
            print('Exception when deleting table.')
            raise
        return True

    def store_img(self):
        """Upload every image named in the list file to the ``cf_pic:data`` column.

        The first column of each list-file row is the image hash; the image
        is read from ``<img_dir>/<hash[:3]>/<hash[3:]>.jpg`` and stored under
        the row key ``<hash>.jpg``.  All images are read into memory before
        the batched upload starts.
        """
        if self.table is None:
            self.table = self.get_table()

        images = {}
        with open(self.list_file, 'rb') as listing:
            for row in csv.reader(listing, delimiter='\t'):
                digest = row[0]
                with open(os.path.join(self.img_dir, digest[:3], digest[3:] + '.jpg'), 'rb') as pic:
                    images[digest + '.jpg'] = pic.read()

        with self.table.batch(batch_size=2000) as batch:
            for row_key, blob in images.items():
                batch.put(row_key, {'cf_pic:data': blob})


    def store_info(self, infotype='all'):
        """Upload the info columns of the list file to the ``cf_info`` family.

        For each list-file row the fields between the first column (the hash)
        and the last two columns are kept; the first five of them are stored
        as width, height, size, capacity and quality under the row key
        ``<hash>.jpg``.

        :param infotype: only ``'all'`` is supported.
        :raises Exception: if ``infotype`` is not ``'all'`` (raised after the
            list file has been read).
        """
        if self.table is None:
            self.table = self.get_table()

        with open(self.list_file, 'rb') as listing:
            info_by_key = {row[0] + '.jpg': row[1:-2] for row in csv.reader(listing, delimiter='\t')}

        if infotype != 'all':
            raise Exception("Unknown infotype!")

        with self.table.batch(batch_size=5000) as batch:
            for row_key, info in info_by_key.items():
                batch.put(row_key, {
                    'cf_info:width': info[0],
                    'cf_info:height': info[1],
                    'cf_info:size': info[2],
                    'cf_info:capacity': info[3],
                    'cf_info:quality': info[4],
                })


    def store_tag(self, tagtype='all'):
        """Upload the last two list-file columns to the ``cf_tag`` family.

        For each row the second-to-last field is stored as ``cf_tag:chosen``
        and the last field as ``cf_tag:class`` under the row key
        ``<hash>.jpg``.

        :param tagtype: only ``'all'`` is supported.
        :raises Exception: if ``tagtype`` is not ``'all'`` (raised after the
            list file has been read).
        """
        if self.table is None:
            self.table = self.get_table()

        with open(self.list_file, 'rb') as listing:
            tags_by_key = {row[0] + '.jpg': row[-2:] for row in csv.reader(listing, delimiter='\t')}

        if tagtype != 'all':
            raise Exception("Unknown tagtype!")

        with self.table.batch(batch_size=5000) as batch:
            for row_key, tags in tags_by_key.items():
                batch.put(row_key, {'cf_tag:chosen': tags[0], 'cf_tag:class': tags[1]})


    def store_feat(self, feattype='ibd'):
        """Upload every feature file under ``feat_dir`` to the ``cf_feat`` family.

        The row key is the name of the file's directory followed by the file
        name with ``.<feattype>`` replaced by ``.jpg``; the raw file content
        is stored in the column ``cf_feat:<feattype>``.  All files are read
        into memory before the batched upload starts.

        :param feattype: feature type, used as file suffix and column name.
        """
        if self.table is None:
            self.table = self.get_table()

        features = {}
        for root, _dirs, filenames in os.walk(self.feat_dir):
            bucket = root.split('/')[-1]
            for filename in filenames:
                with open(os.path.join(root, filename), 'rb') as feat_file:
                    row_key = bucket + filename.replace('.' + feattype, '.jpg')
                    features[row_key] = feat_file.read()

        with self.table.batch(batch_size=5000) as batch:
            for row_key, payload in features.items():
                batch.put(row_key, {'cf_feat:' + feattype: payload})


    def load_data(self, mode='local', feattype='ibd', tagtype='class', shuffle=False):
        """Load feature vectors and labels from disk, HBase or Spark.

        * ``mode='local'`` -- rows of ``list_file`` give the hash (first
          column) and the label (last column).  With ``feattype='coef'`` the
          feature comes from the image's Y-channel coefficient matrix:
          selected positions are zeroed, then the lowest bit of the absolute
          value is kept.  With any other ``feattype`` the feature is the
          flattened JSON content of the stored feature file.
        * ``mode='hbase'`` -- the ``cf_feat:<feattype>`` and
          ``cf_tag:<tagtype>`` columns are scanned from the table.
        * ``mode='spark'`` -- ``(feat, tag)`` pairs are read through the
          Spark helper.

        :param mode: ``'local'``, ``'hbase'`` or ``'spark'``.
        :param feattype: feature type (file suffix / column qualifier).
        :param tagtype: tag column qualifier, used in ``'hbase'`` mode.
        :param shuffle: if true, return the shuffled list of ``(x, y)`` pairs.
        :return: ``(X, Y)``, or a shuffled list of ``(x, y)`` pairs when
            ``shuffle`` is true.
        :raises Exception: if ``mode`` is not one of the three known modes.
        """
        print("loading data...")
        X, Y = [], []

        if mode == "local":
            use_coef = feattype == 'coef'  # raw coefficients instead of a stored feature

            # hash -> (label, feature); a hash listed twice keeps its last row
            samples = {}
            with open(self.list_file, 'rb') as listing:
                for row in csv.reader(listing, delimiter='\t'):
                    digest, label = row[0], row[-1]
                    if use_coef:
                        img_path = os.path.join(self.img_dir, digest[:3], digest[3:] + '.jpg')
                        jpg = Jpeg(img_path, key=sample_key)
                        samples[digest] = (label, jpg.getCoefMatrix(channel='Y'))
                    else:
                        feat_path = os.path.join(self.feat_dir, digest[:3], digest[3:] + '.' + feattype)
                        with open(feat_path, 'rb') as feat_file:
                            samples[digest] = (label, json.loads(feat_file.read()))

            if use_coef:
                # Flat indices i * 200 + j for i, j in 0, 8, ..., 192.
                zeroed = [i * 200 + j for i in range(0, 200, 8) for j in range(0, 200, 8)]
                for label, coef in samples.values():
                    coef.ravel()[zeroed] = 0
                    bits = np.bitwise_and(np.absolute(coef), 1)
                    X.append(bits.ravel())
                    Y.append(int(label))
            else:
                for label, feat in samples.values():
                    X.append(np.array(feat).ravel().tolist())
                    Y.append(int(label))

        elif mode == "hbase":  # remote
            if self.table is None:
                self.table = self.get_table()

            col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype
            for _key, cells in self.table.scan(columns=[col_feat, col_tag]):
                nested = json.loads(cells[col_feat])
                X.append([item for sublist in nested for subsublist in sublist for item in subsublist])
                Y.append(int(cells[col_tag]))

        elif mode == "spark":  # cluster
            if self.sparker is None:
                self.sparker = SC.Sparker(host=self.host, appname=self.appname, master=self.master)

            for feat, tag in self.sparker.read_hbase(self.table_name):
                X.append(feat)
                Y.append(tag)

        else:
            raise Exception("Unknown mode!")

        if not shuffle:
            return X, Y

        # shuffling
        pairs = list(zip(X, Y))
        np.random.shuffle(pairs)
        return pairs