Blame view

mdata/ILSVRC.py 12.4 KB
2c2d57c7   Chunk   ILSVRC datapath h...
1
2
3
__author__ = 'chunk'

from . import *
84648488   Chunk   reverted.
4
from ..mfeat import HOG, IntraBlockDiff
2c2d57c7   Chunk   ILSVRC datapath h...
5
6
7
8
9
10
11
12
13
14
15
16
17
18
from ..mspark import SC
from ..common import *

import os, sys
from PIL import Image
from hashlib import md5
import csv
import shutil
import json
import collections
import happybase

from ..mjpeg import *
from ..msteg import *
9ff70cf4   Chunk   capacity engeneer...
19

2c2d57c7   Chunk   ILSVRC datapath h...
20
import numpy as np
d1042d03   Chunk   staged.
21
22
23
24
from numpy.random import randn
import pandas as pd
from scipy import stats

ec755e37   Chunk   cropping.
25
from subprocess import Popen, PIPE, STDOUT
d1042d03   Chunk   staged.
26

080c30c2   Chunk   F5 lib updated. I...
27
28

# Seed NumPy's global RNG with a fixed value (character sum of "whoami") so
# random draws are reproducible across runs -- presumably for the
# stats.bernoulli.rvs() call in _anaylis(); confirm scipy uses this RNG state.
np.random.seed(sum(map(ord, "whoami")))
84648488   Chunk   reverted.
29

d1042d03   Chunk   staged.
30
31
# Absolute directory containing this module; embed() uses it to point
# CLASSPATH at the bundled F5 java libs ("../libs/F5/" relative to here).
package_dir = os.path.dirname(os.path.abspath(__file__))

080c30c2   Chunk   F5 lib updated. I...
32
33

class DataILSVRC(DataDumperBase):
    """Data dumper for an ILSVRC image set.

    Copies images into a content-addressed (md5-bucketed) tree, maintains a
    tab-separated file/tag list, optionally runs F5 steganographic embedding,
    and pushes images/metadata/features into an HBase table.
    """

    def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train'):
        DataDumperBase.__init__(self, base_dir, category)

        # Raw images live under <base_dir>/<category>; everything derived
        # (hashed copies, feature files, the tag list) goes under dst/.
        self.base_dir = base_dir
        self.category = category
        self.data_dir = os.path.join(self.base_dir, self.category)

        self.dst_dir = os.path.join(self.base_dir, 'dst', self.category)
        self.list_file = os.path.join(self.dst_dir, 'file-tag.tsv')
        self.feat_dir = os.path.join(self.dst_dir, 'Feat')
        self.img_dir = os.path.join(self.dst_dir, 'Img')

        # md5-digest -> [width, height, w*h, capacity, quality],
        # filled in by _hash_copy().
        self.dict_data = {}

        # HBase table name: "<last component of base_dir>-<category>".
        self.table_name = self.base_dir.strip('/').split('/')[-1] + '-' + self.category

        # NOTE(review): original (misspelled) attribute name kept --
        # load_data() reads self.sparkcontex, so renaming would break it.
        self.sparkcontex = None
02528074   Chunk   staged.
51
    def format(self):
        """Run the full formatting pass: hash-copy every raw image, then
        build and annotate the file/tag list (delegates to extract())."""
        self.extract()

f1fa5b17   Chunk   review & streaming.
54
    def _hash_copy(self, image):
        """Copy *image* into the content-addressed tree under self.img_dir.

        Non-JPEG inputs are first re-encoded to '../res/tmp.jpg'.  The file's
        md5 hex digest becomes its identity: metadata is recorded in
        self.dict_data[digest] and the file is stored as
        <img_dir>/<digest[:3]>/<digest[3:]>.jpg (copy skipped if present).

        Fixes: renamed local that shadowed the builtin `dir`; removed the
        dead `else: pass` branch.
        """
        if not image.endswith('jpg'):
            # Re-encode anything that is not already a JPEG.
            img = Image.open(image)
            img.save('../res/tmp.jpg', format='JPEG')
            image = '../res/tmp.jpg'

        with open(image, 'rb') as f:
            index = md5(f.read()).hexdigest()

        # NOTE(review): `sample_key` is expected from a star import
        # (presumably ..mjpeg) -- confirm.
        im = Jpeg(image, key=sample_key)
        self.dict_data[index] = [im.image_width,
                                 im.image_height,
                                 im.image_width * im.image_height,
                                 im.getCapacity(),
                                 im.getQuality()]

        # Bucket by the first three hex chars to keep directories small.
        target_dir = os.path.join(self.img_dir, index[:3])
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        image_path = os.path.join(target_dir, index[3:] + '.jpg')
        if not os.path.exists(image_path):
            shutil.copy(image, image_path)

    def _build_list(self, list_file=None):
        """Write self.dict_data to *list_file* as a TSV, one image per row
        ([hash, metadata...]), with rows ordered by hash.  Defaults to
        self.list_file when no path is given.
        """
        if list_file is None:
            list_file = self.list_file
        assert list_file is not None

        with open(list_file, 'w') as f:
            writer = csv.writer(f, delimiter='\t')
            # Iterating sorted keys gives the same ordering the original
            # achieved via a sorted OrderedDict.
            for img_hash in sorted(self.dict_data):
                writer.writerow([img_hash] + self.dict_data[img_hash])

    def _anaylis(self, list_file=None):
        """Annotate the TSV list with sampling columns and rewrite it in place.

        Reads *list_file* (default self.list_file) whose rows are
        [hash, width, height, size, capacity, quality], sorts by
        (size, quality), then appends:
          - 'chosen': Bernoulli(p=0.3) draw marking rows for embedding
          - 'class' : all zeros (cover images; embed() later appends class-1 rows)
        and writes it back with no header, tab-separated.

        Fixes: `DataFrame.sort` (removed in pandas 0.20) replaced with the
        equivalent `sort_values`; `== None` comparisons replaced with `is`.
        NOTE(review): method name keeps its original misspelling because
        extract() calls it by this name.
        """
        if list_file is None:
            list_file = self.list_file
        assert list_file is not None

        df_ILS = pd.read_csv(list_file, names=['hash', 'width', 'height', 'size', 'capacity', 'quality'], sep='\t')
        length = df_ILS.shape[0]
        df_ILS = df_ILS.sort_values(['size', 'quality'], ascending=True)
        rand_class = stats.bernoulli.rvs(0.3, size=length)

        df_ILS['chosen'] = rand_class
        df_ILS['class'] = np.zeros(length, np.int32)

        df_ILS.to_csv(list_file, header=False, index=False, sep='\t')

    def extract(self):
        """Walk self.data_dir, hash-copy every file found, then build and
        annotate the TSV list.

        Processing stays best-effort (a bad file never aborts the walk), but
        failures are now reported instead of being silently swallowed, and
        the bare `except:` no longer traps KeyboardInterrupt/SystemExit.
        """
        for path, subdirs, files in os.walk(self.data_dir):
            for name in files:
                imagepath = os.path.join(path, name)
                try:
                    self._hash_copy(imagepath)
                except Exception as e:
                    # Skip unreadable/non-image files but leave a trace.
                    print('[extract] skipped %s: %s' % (imagepath, e))

        self._build_list()
        self._anaylis()


    def embed(self):
        """Run the F5 java embedder over every row with chosen==1, log its
        output, hash-copy the stego results, and merge them back into the
        list file as class-1 rows.
        """
        self.dict_data = {}
        dict_embedresult = {}
        os.environ["CLASSPATH"] = os.path.join(package_dir, "../libs/F5/")
        # NOTE(review): four placeholders (%s %s -e %s ... %d) but only three
        # values are supplied below -- this raises TypeError.  Preserved as-is
        # pending clarification of the intended '-e' argument.
        cmd = 'java Embed %s %s -e %s  -p password -c "stegan by chunk  " -q %d'

        # NOTE(review): _anaylis() writes 8 columns (including 'capacity');
        # reading 7 names here mis-aligns them -- verify the list format.
        df_ILS = pd.read_csv(self.list_file, names=['hash', 'width', 'height', 'size', 'quality', 'chosen', 'class'],
                             sep='\t')
        df_ILS_TARGET = df_ILS[df_ILS['chosen'] == 1]

        for img_hash, img_size, img_quality in zip(df_ILS_TARGET['hash'], df_ILS_TARGET['size'],
                                                   df_ILS_TARGET['quality']):
            path_img = os.path.join(self.img_dir, img_hash[:3], img_hash[3:] + '.jpg')
            if path_img:  # always truthy; kept for parity with the original
                print(path_img)
                proc = Popen(cmd % (path_img, 'res/tmp.jpg', img_quality), shell=True, stdout=PIPE, stderr=STDOUT)
                dict_embedresult[img_hash] = [line.strip('\n') for line in proc.stdout.readlines()]
                try:
                    self._hash_copy('res/tmp.jpg')
                except:
                    pass

        with open(self.list_file + '.embed.log', 'wb') as f:
            logwriter = csv.writer(f, delimiter='\t')
            for key, value in dict_embedresult.items():
                logwriter.writerow([key] + value)

        self._build_list(self.list_file + '.embed')

        # Merge the stego rows back in: chosen=0, class=1.
        df_ILS_EMBED = pd.read_csv(self.list_file + '.embed', names=['hash', 'width', 'height', 'size', 'quality'],
                                   sep='\t')
        length = df_ILS_EMBED.shape[0]
        df_ILS_EMBED = df_ILS_EMBED.sort(['size', 'quality'], ascending=True)
        df_ILS_EMBED['chosen'] = np.zeros(length, np.int32)
        df_ILS_EMBED['class'] = np.ones(length, np.int32)

        df_ILS = df_ILS.append(df_ILS_EMBED, ignore_index=True)
        df_ILS.to_csv(self.list_file, header=False, index=False, sep='\t')
2c2d57c7   Chunk   ILSVRC datapath h...
158
159

    def get_table(self):
        """Return the HBase table for this dataset, creating the connection
        and (if missing) the table with its four column families on first use.
        The handle is cached on self.table.
        """
        if self.table is not None:
            return self.table

        if self.connection is None:
            self.connection = happybase.Connection('HPC-server')

        if self.table_name not in self.connection.tables():
            families = {
                'cf_pic': dict(),
                'cf_info': dict(max_versions=10),
                'cf_tag': dict(),
                'cf_feat': dict(),
            }
            self.connection.create_table(name=self.table_name, families=families)

        self.table = self.connection.table(name=self.table_name)
        return self.table
84648488   Chunk   reverted.
181

080c30c2   Chunk   F5 lib updated. I...
182

d0be60e7   Chunk   jpeg update.
183
    def store_image(self):
        """Upload every image listed in self.list_file into HBase.

        Rows are keyed '<hash>.jpg' with the raw JPEG bytes under
        'cf_pic:data', written in batches of 5000.

        Fixes: the original guard `if path_img:` was always true for a
        non-empty join -- an existence check is what was evidently intended,
        so missing files are now skipped instead of raising IOError.  The
        `except ValueError: raise` wrapper re-raised unconditionally (a
        no-op), so it has been removed along with its unreachable `pass`.
        """
        if self.table is None:
            self.table = self.get_table()

        dict_databuf = {}

        with open(self.list_file, 'rb') as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            for line in reader:
                path_img = os.path.join(self.img_dir, line[0][:3], line[0][3:] + '.jpg')
                if os.path.exists(path_img):
                    with open(path_img, 'rb') as fpic:
                        dict_databuf[line[0] + '.jpg'] = fpic.read()

        with self.table.batch(batch_size=5000) as b:
            for imgname, imgdata in dict_databuf.items():
                b.put(imgname, {'cf_pic:data': imgdata})
d0be60e7   Chunk   jpeg update.
204

84648488   Chunk   reverted.
205

d0be60e7   Chunk   jpeg update.
206
207
208
209
210
211
212
213
    def store_info(self, infotype='all'):
        """Store per-image metadata into HBase ('cf_info' family).

        For each TSV row, writes width/height/size/quality columns keyed by
        '<hash>.jpg'.  Only infotype='all' is supported.
        NOTE(review): assumes line[1:-2] lines up with those four fields --
        verify against the columns _anaylis()/embed() actually write.
        """
        if self.table is None:
            self.table = self.get_table()

        dict_infobuf = {}

        with open(self.list_file, 'rb') as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            for line in reader:
                dict_infobuf[line[0] + '.jpg'] = line[1:-2]

        if infotype != 'all':
            raise Exception("Unknown infotype!")

        try:
            with self.table.batch(batch_size=5000) as b:
                for imgname, imginfo in dict_infobuf.items():
                    b.put(imgname,
                          {'cf_info:width': imginfo[0], 'cf_info:height': imginfo[1], 'cf_info:size': imginfo[2],
                           'cf_info:quality': imginfo[3]})
        except ValueError:
            raise

84648488   Chunk   reverted.
230

9ff70cf4   Chunk   capacity engeneer...
231
232
    def store_tag(self, tagtype='all'):
        """Store the last two TSV columns (chosen, class) into HBase's
        'cf_tag' family, keyed by '<hash>.jpg'.  Only tagtype='all' is
        supported.
        """
        if self.table is None:
            self.table = self.get_table()

        dict_tagbuf = {}

        with open(self.list_file, 'rb') as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            for line in reader:
                dict_tagbuf[line[0] + '.jpg'] = line[-2:]

        if tagtype != 'all':
            raise Exception("Unknown tagtype!")

        try:
            with self.table.batch(batch_size=5000) as b:
                for imgname, imgtag in dict_tagbuf.items():
                    b.put(imgname, {'cf_tag:chosen': imgtag[0], 'cf_tag:class': imgtag[1]})
        except ValueError:
            raise


    def get_feat(self, image, feattype='ibd', **kwargs):
        """Compute a feature descriptor for a single image.

        feattype 'hog' uses HOG.FeatHOG (window from kwargs['size'],
        default (48, 48)); 'ibd' uses IntraBlockDiff.  Raises for any other
        feattype.  Returns whatever the extractor's feat() yields.
        """
        size = kwargs.get('size', (48, 48))

        if feattype == 'hog':
            extractor = HOG.FeatHOG(size=size)
        elif feattype == 'ibd':
            extractor = IntraBlockDiff.FeatIntraBlockDiff()
        else:
            raise Exception("Unknown feature type!")

        return extractor.feat(image)


    def extract_feat(self, feattype='ibd'):
        """Compute and persist a feature file for every image in the list.

        Features are JSON-dumped next to the image bucket layout:
        <feat_dir>/<hash[:3]>/<hash[3:]>.<feattype>
        """
        if feattype == 'hog':
            feater = HOG.FeatHOG(size=(48, 48))
        elif feattype == 'ibd':
            feater = IntraBlockDiff.FeatIntraBlockDiff()
        else:
            raise Exception("Unknown feature type!")

        list_image = []
        with open(self.list_file, 'rb') as tsvfile:
            for line in csv.reader(tsvfile, delimiter='\t'):
                list_image.append(line[0])

        dict_featbuf = {}
        for imgname in list_image:
            image = os.path.join(self.img_dir, imgname[:3], imgname[3:] + '.jpg')
            dict_featbuf[imgname] = feater.feat(image)

        for imgname, desc in dict_featbuf.items():
            bucket_dir = os.path.join(self.feat_dir, imgname[:3])
            if not os.path.exists(bucket_dir):
                os.makedirs(bucket_dir)
            featpath = os.path.join(bucket_dir, imgname[3:].split('.')[0] + '.' + feattype)
            with open(featpath, 'wb') as featfile:
                featfile.write(json.dumps(desc.tolist()))


    def store_feat(self, feattype='ibd'):
        """Upload every on-disk feature file into HBase under
        'cf_feat:<feattype>', row keys rebuilt as '<hash>.jpg' from the
        bucketed directory layout.
        """
        if self.table is None:
            self.table = self.get_table()

        dict_featbuf = {}
        for path, subdirs, files in os.walk(self.feat_dir):
            bucket = path.split('/')[-1]
            for name in files:
                # <bucket>/<rest>.<feattype>  ->  row key <bucket><rest>.jpg
                imgname = bucket + name.replace('.' + feattype, '.jpg')
                with open(os.path.join(path, name), 'rb') as featfile:
                    dict_featbuf[imgname] = featfile.read()

        try:
            with self.table.batch(batch_size=5000) as b:
                for imgname, featdesc in dict_featbuf.items():
                    b.put(imgname, {'cf_feat:' + feattype: featdesc})
        except ValueError:
            raise


    def load_data(self, mode='local', feattype='ibd', tagtype='class'):
        """Load a training set (X, Y).

        mode='local'           -- read feature JSON files and tags from disk
        mode='remote'/'hbase'  -- scan the HBase table
        mode='spark'/'cluster' -- read via the Spark helper
        Returns (X, Y): X a list of flat feature vectors, Y the labels.
        """
        INDEX = []
        X = []
        Y = []

        if mode == "local":
            dict_dataset = {}

            with open(self.list_file, 'rb') as tsvfile:
                for line in csv.reader(tsvfile, delimiter='\t'):
                    img_hash = line[0]
                    tag = line[-1]
                    path_feat = os.path.join(self.feat_dir, img_hash[:3], img_hash[3:] + '.' + feattype)
                    if path_feat:  # always truthy; kept for parity with the original
                        with open(path_feat, 'rb') as featfile:
                            dict_dataset[img_hash] = (tag, json.loads(featfile.read()))

            for tag, feat in dict_dataset.values():
                # Flatten the 3-level nested feature structure into one vector.
                X.append([item for sublist in feat for subsublist in sublist for item in subsublist])
                Y.append(float(tag))

        elif mode == "remote" or mode == "hbase":
            if self.table is None:
                self.table = self.get_table()

            col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype
            for key, data in self.table.scan(columns=[col_feat, col_tag]):
                X.append(json.loads(data[col_feat]))
                Y.append(1 if data[col_tag] == 'True' else 0)

        elif mode == "spark" or mode == "cluster":
            # NOTE(review): attribute intentionally spelled 'sparkcontex',
            # matching __init__.
            if self.sparkcontex is None:
                self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077')

            # result = {key: [feat, tag], ...}
            result = self.sparkcontex.read_habase(self.table_name)
            for feat, tag in result:
                X.append(feat)
                Y.append(tag)

        else:
            raise Exception("Unknown mode!")

        return X, Y
24768a99   Chunk   mode 'hbase' fini...

2c2d57c7   Chunk   ILSVRC datapath h...

489c5608   Chunk   debugging...

2c2d57c7   Chunk   ILSVRC datapath h...

2c2d57c7   Chunk   ILSVRC datapath h...

84648488   Chunk   reverted.

cb798a7f   Chunk   libs & scripts in...

080c30c2   Chunk   F5 lib updated. I...

cb798a7f   Chunk   libs & scripts in...

84648488   Chunk   reverted.

554a7b9a   Chunk   staged.

cb798a7f   Chunk   libs & scripts in...

cb798a7f   Chunk   libs & scripts in...

080c30c2   Chunk   F5 lib updated. I...

cb798a7f   Chunk   libs & scripts in...

84648488   Chunk   reverted.

080c30c2   Chunk   F5 lib updated. I...

2c2d57c7   Chunk   ILSVRC datapath h...

080c30c2   Chunk   F5 lib updated. I...

2c2d57c7   Chunk   ILSVRC datapath h...

080c30c2   Chunk   F5 lib updated. I...

080c30c2   Chunk   F5 lib updated. I...

2c2d57c7   Chunk   ILSVRC datapath h...

84648488   Chunk   reverted.

2c2d57c7   Chunk   ILSVRC datapath h...

84648488   Chunk   reverted.

bde8352b   Chunk   shuffling.

f1fa5b17   Chunk   review & streaming.

2c2d57c7   Chunk   ILSVRC datapath h...

d0be60e7   Chunk   jpeg update.

ec755e37   Chunk   cropping.

bbd2f705   Chunk   cropping.

ec755e37   Chunk   cropping.

b9990e77   Chunk   staged.

84648488   Chunk   reverted.

bde8352b   Chunk   shuffling.

e6be6b61   Chunk   import caffe.

b9990e77   Chunk   staged.

ec755e37   Chunk   cropping.

d0be60e7   Chunk   jpeg update.

b9990e77   Chunk   staged.

2c2d57c7   Chunk   ILSVRC datapath h...

ec755e37   Chunk   cropping.

2c2d57c7   Chunk   ILSVRC datapath h...

ec755e37   Chunk   cropping.

84648488   Chunk   reverted.

02528074   Chunk   staged.

2c2d57c7   Chunk   ILSVRC datapath h...

ec755e37   Chunk   cropping.

02528074   Chunk   staged.

84648488   Chunk   reverted.

2c2d57c7   Chunk   ILSVRC datapath h...

02528074   Chunk   staged.

2c2d57c7   Chunk   ILSVRC datapath h...

cb798a7f   Chunk   libs & scripts in...

2c2d57c7   Chunk   ILSVRC datapath h...

bde8352b   Chunk   shuffling.

2c2d57c7   Chunk   ILSVRC datapath h...

bde8352b   Chunk   shuffling.

84648488   Chunk   reverted.