Blame view

mdata/ILSVRC.py 10.2 KB
2c2d57c7   Chunk   ILSVRC datapath h...
1
2
3
__author__ = 'chunk'

from . import *
84648488   Chunk   reverted.
4
from ..mfeat import HOG, IntraBlockDiff
2c2d57c7   Chunk   ILSVRC datapath h...
5
6
7
8
9
10
11
12
13
14
15
16
17
18
from ..mspark import SC
from ..common import *

import os, sys
from PIL import Image
from hashlib import md5
import csv
import shutil
import json
import collections
import happybase

from ..mjpeg import *
from ..msteg import *
9ff70cf4   Chunk   capacity engeneer...
19

2c2d57c7   Chunk   ILSVRC datapath h...
20
import os
d1042d03   Chunk   staged.
21
22
23
24
import numpy as np
from numpy.random import randn
import pandas as pd
from scipy import stats
ec755e37   Chunk   cropping.
25

d1042d03   Chunk   staged.
26

080c30c2   Chunk   F5 lib updated. I...
27
28
# Seed NumPy's global RNG with a fixed value (the sum of the character codes
# of "whoami") so that random draws are repeatable from run to run.
np.random.seed(sum(map(ord, "whoami")))

84648488   Chunk   reverted.
29

d1042d03   Chunk   staged.
30
31
class DataILSVRC(DataDumperBase):
    def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train'):
        """Set up the directory layout and table name for one dataset category.

        :param base_dir: root directory of the dataset.
        :param category: name of the sub-directory of ``base_dir`` holding the
            raw images; also used as the suffix of the table name.
        """
        DataDumperBase.__init__(self, base_dir, category)

        self.base_dir, self.category = base_dir, category

        # Raw input is read from <base_dir>/<category>.
        self.data_dir = os.path.join(self.base_dir, self.category)

        # Everything this class writes goes under <base_dir>/dst/<category>.
        dst_root = os.path.join(self.base_dir, 'dst', self.category)
        self.dst_dir = dst_root
        self.list_file = os.path.join(dst_root, 'file-tag.tsv')
        self.feat_dir = os.path.join(dst_root, 'Feat')
        self.img_dir = os.path.join(dst_root, 'Img')

        # Per-image metadata, keyed by the image's MD5 hex digest.
        self.dict_data = {}

        # Table name: last path component of base_dir, '-', then the category.
        last_component = self.base_dir.strip('/').rsplit('/', 1)[-1]
        self.table_name = last_component + '-' + self.category
        self.sparkcontex = None

    def format(self):
        """Prepare the dataset; this simply delegates to ``extract()``."""
        self.extract()

02528074   Chunk   staged.
51
    def _hash_copy(self, image):
        """Record one image's metadata and copy it into the hashed image tree.

        A file whose name does not end in ``'jpg'`` is first re-saved as JPEG
        to ``res/tmp.jpg`` and that copy is used from then on. The MD5 hex
        digest of the file's bytes is the image's key: its first three
        characters name the sub-directory under ``self.img_dir`` and the
        remainder names the ``.jpg`` file inside it. ``self.dict_data[key]``
        is set to ``[width, height, file size, quality]``.

        :param image: path of the source image file.
        """
        src = image
        if not src.endswith('jpg'):
            # Anything not named *jpg goes through a fixed scratch file.
            # NOTE(review): the check is case-sensitive, so '.JPEG'/'.JPG'
            # inputs are re-encoded too -- confirm that this is intended.
            converted = 'res/tmp.jpg'
            Image.open(src).save(converted, format='JPEG')
            src = converted

        with open(src, 'rb') as fh:
            digest = md5(fh.read()).hexdigest()

        jpeg = Jpeg(src, key=sample_key)
        self.dict_data[digest] = [
            jpeg.image_width,
            jpeg.image_height,
            os.path.getsize(src),
            jpeg.getQuality(),
        ]

        bucket = os.path.join(self.img_dir, digest[:3])
        if not os.path.exists(bucket):
            os.makedirs(bucket)

        # An already-present target is left untouched.
        target = os.path.join(bucket, digest[3:] + '.jpg')
        if not os.path.exists(target):
            shutil.copy(src, target)

    def _build_list(self):
        """Write ``self.dict_data`` to ``self.list_file`` as tab-separated rows.

        Rows are ordered by key; each row is the key followed by the list of
        values stored for it.
        """
        assert self.list_file is not None

        ordered_keys = sorted(self.dict_data)

        with open(self.list_file, 'w') as out:
            writer = csv.writer(out, delimiter='\t')
            for digest in ordered_keys:
                writer.writerow([digest] + self.dict_data[digest])
554a7b9a   Chunk   staged.
85

84648488   Chunk   reverted.
86
87
88
89
90
    def _anaylis(self):
        """Sort the list file and append a random binary ``class`` column.

        Reads ``self.list_file`` (columns: hash, width, height, size,
        quality), orders the rows by size and then quality (ascending), draws
        one Bernoulli(p=0.3) sample per row as that row's ``class``, and
        writes the result back to ``self.list_file`` without header or index.
        """
        df_ILS = pd.read_csv(self.list_file, names=['hash', 'width', 'height', 'size', 'quality'], sep='\t')
        length = df_ILS.shape[0]

        # DataFrame.sort was deprecated in favour of sort_values and is absent
        # from newer pandas releases; keep the old call only as a fallback for
        # very old installations.
        if hasattr(df_ILS, 'sort_values'):
            df_new = df_ILS.sort_values(by=['size', 'quality'], ascending=True)
        else:
            df_new = df_ILS.sort(['size', 'quality'], ascending=True)

        rand_class = stats.bernoulli.rvs(0.3, size=length)

        # Align the samples with the sorted frame's index so the i-th draw
        # lands on the i-th row of the sorted order.
        df_new['class'] = pd.Series(rand_class, index=df_new.index)
        df_new.to_csv(self.list_file, header=False, index=False, sep='\t')


    def extract(self):
        """Import every file under ``self.data_dir`` and rebuild the list file.

        Each file found while walking ``self.data_dir`` is handed to
        ``_hash_copy``. A file that fails to import is reported on stderr and
        skipped, so one bad file does not abort the whole run. Afterwards the
        list file is written (``_build_list``) and post-processed
        (``_anaylis``).
        """
        for path, subdirs, files in os.walk(self.data_dir):
            for name in files:
                imagepath = os.path.join(path, name)
                try:
                    self._hash_copy(imagepath)
                except Exception as err:
                    # Best-effort import. Catching Exception rather than using
                    # a bare ``except`` lets KeyboardInterrupt and SystemExit
                    # propagate, and the message makes skipped files visible.
                    sys.stderr.write('skipped %s: %s\n' % (imagepath, err))

        self._build_list()
        self._anaylis()

    def get_table(self):
        """Return the table handle for ``self.table_name``, creating the table if absent.

        The handle is cached in ``self.table``. The connection (to
        ``'HPC-server'``) is opened on first use and cached in
        ``self.connection``.
        """
        if self.table is not None:
            return self.table

        if self.connection is None:
            self.connection = happybase.Connection('HPC-server')

        if self.table_name not in self.connection.tables():
            # First use of this dataset: create the table with its four
            # column families.
            self.connection.create_table(
                name=self.table_name,
                families={
                    'cf_pic': dict(),
                    'cf_info': dict(max_versions=10),
                    'cf_tag': dict(),
                    'cf_feat': dict(),
                },
            )

        self.table = self.connection.table(name=self.table_name)
        return self.table
080c30c2   Chunk   F5 lib updated. I...
131
132
133
134


    def store_image(self):
        """Upload the bytes of every listed image to the ``cf_pic:data`` column.

        For each row of ``self.list_file`` the image is read from
        ``<img_dir>/<hash[:3]>/<hash[3:]>.jpg`` and stored under the row key
        ``<hash>.jpg``. Listed images whose file is missing are skipped.
        """
        if self.table is None:
            self.table = self.get_table()

        with open(self.list_file, 'rb') as tsvfile:
            # Feed the batch while reading the list so that only one image is
            # held in memory at a time (the batch flushes every 5000 puts).
            with self.table.batch(batch_size=5000) as b:
                for line in csv.reader(tsvfile, delimiter='\t'):
                    path_img = os.path.join(self.img_dir, line[0][:3], line[0][3:] + '.jpg')
                    # Test the file itself: a non-empty path string is always
                    # truthy, so checking the string alone guards nothing.
                    if not os.path.isfile(path_img):
                        continue
                    with open(path_img, 'rb') as fpic:
                        b.put(line[0] + '.jpg', {'cf_pic:data': fpic.read()})
d0be60e7   Chunk   jpeg update.
154
155

    def store_info(self, infotype='all'):
        """Upload width, height, size and quality of every listed image to ``cf_info``.

        Row keys are ``<hash>.jpg``; the values are the strings read from
        ``self.list_file``.

        :param infotype: only ``'all'`` is supported.
        :raises Exception: ``"Unknown mode!"`` for any other ``infotype``.
        """
        if self.table is None:
            self.table = self.get_table()

        dict_infobuf = {}

        with open(self.list_file, 'rb') as tsvfile:
            for line in csv.reader(tsvfile, delimiter='\t'):
                # Row layout: hash, width, height, size, quality[, class].
                # Columns 1..4 are exactly the four info fields, whether or
                # not the trailing class column is present.
                dict_infobuf[line[0] + '.jpg'] = line[1:5]

        if infotype != 'all':
            raise Exception("Unknown mode!")

        with self.table.batch(batch_size=5000) as b:
            for imgname, (width, height, size, quality) in dict_infobuf.items():
                b.put(imgname,
                      {'cf_info:width': width, 'cf_info:height': height, 'cf_info:size': size,
                       'cf_info:quality': quality})
d0be60e7   Chunk   jpeg update.
178

080c30c2   Chunk   F5 lib updated. I...
179

9ff70cf4   Chunk   capacity engeneer...
180
    def store_tag(self, tagtype='class'):
        """Upload the last column of every list-file row as that image's tag.

        Row keys are ``<hash>.jpg``; the value is stored in the column
        ``cf_tag:<tagtype>``.

        :param tagtype: column qualifier appended to ``'cf_tag:'``.
        """
        if self.table is None:
            self.table = self.get_table()

        with open(self.list_file, 'rb') as listing:
            rows = csv.reader(listing, delimiter='\t')
            tags_by_row = dict((row[0] + '.jpg', row[-1]) for row in rows)

        with self.table.batch(batch_size=5000) as batch:
            for rowkey, tag in tags_by_row.items():
                batch.put(rowkey, {'cf_tag:' + tagtype: tag})


    def get_feat(self, image, feattype='ibd', **kwargs):
        """Compute the ``feattype`` descriptor of a single image.

        :param image: passed unchanged to the extractor's ``feat`` method.
        :param feattype: ``'hog'`` or ``'ibd'``.
        :param kwargs: ``size`` (default ``(48, 48)``) is forwarded to the HOG
            extractor; it is not used for ``'ibd'``.
        :return: whatever the extractor's ``feat`` method returns.
        :raises Exception: ``"Unknown feature type!"`` for any other ``feattype``.
        """
        patch_size = kwargs.get('size', (48, 48))

        if feattype == 'hog':
            extractor = HOG.FeatHOG(size=patch_size)
        elif feattype == 'ibd':
            extractor = IntraBlockDiff.FeatIntraBlockDiff()
        else:
            raise Exception("Unknown feature type!")

        return extractor.feat(image)

080c30c2   Chunk   F5 lib updated. I...
214
    def extract_feat(self, feattype='ibd'):
        """Compute the ``feattype`` descriptor of every listed image and save it as JSON.

        For each hash in the first column of ``self.list_file`` the image
        ``<img_dir>/<hash[:3]>/<hash[3:]>.jpg`` is fed to the extractor and the
        descriptor (via ``tolist()``) is written as JSON to
        ``<feat_dir>/<hash[:3]>/<hash[3:]>.<feattype>``.

        :param feattype: ``'hog'`` (48x48) or ``'ibd'``.
        :raises Exception: ``"Unknown feature type!"`` for any other ``feattype``.
        """
        if feattype == 'hog':
            feater = HOG.FeatHOG(size=(48, 48))
        elif feattype == 'ibd':
            feater = IntraBlockDiff.FeatIntraBlockDiff()
        else:
            raise Exception("Unknown feature type!")

        with open(self.list_file, 'rb') as tsvfile:
            list_image = [line[0] for line in csv.reader(tsvfile, delimiter='\t')]

        for imgname in list_image:
            image = os.path.join(self.img_dir, imgname[:3], imgname[3:] + '.jpg')
            desc = feater.feat(image)

            # Write each descriptor as soon as it is computed rather than
            # holding the descriptors of the whole dataset in memory first.
            feat_subdir = os.path.join(self.feat_dir, imgname[:3])
            if not os.path.exists(feat_subdir):
                os.makedirs(feat_subdir)

            featpath = os.path.join(feat_subdir, imgname[3:].split('.')[0] + '.' + feattype)
            with open(featpath, 'wb') as featfile:
                featfile.write(json.dumps(desc.tolist()))


    def store_feat(self, feattype='ibd'):
        """Upload every saved ``feattype`` descriptor file to the ``cf_feat`` family.

        Walks ``self.feat_dir``; for each file named ``<rest>.<feattype>`` in
        a directory ``<prefix>`` the raw file contents are stored under the row
        key ``<prefix><rest>.jpg`` in the column ``cf_feat:<feattype>``.
        Files with any other extension are ignored.

        :param feattype: feature-file extension and column qualifier.
        """
        if self.table is None:
            self.table = self.get_table()

        suffix = '.' + feattype

        with self.table.batch(batch_size=5000) as b:
            for path, subdirs, files in os.walk(self.feat_dir):
                for name in files:
                    # Descriptors of other feature types share this tree; they
                    # must not be stored in this feature's column.
                    if not name.endswith(suffix):
                        continue
                    imgname = os.path.basename(path) + name[:-len(suffix)] + '.jpg'
                    with open(os.path.join(path, name), 'rb') as featfile:
                        b.put(imgname, {'cf_feat:' + feattype: featfile.read()})

    def load_data(self, mode='local', feattype='ibd', tagtype='class'):
        """Load feature vectors and their labels.

        :param mode: where to read from --
            ``'local'``: tags from ``self.list_file`` and JSON descriptors
            from ``self.feat_dir``;
            ``'remote'`` / ``'hbase'``: a scan of the table's
            ``cf_feat:<feattype>`` and ``cf_tag:<tagtype>`` columns;
            ``'spark'`` / ``'cluster'``: ``read_habase`` on the Spark helper.
        :param feattype: feature-file extension / column qualifier.
        :param tagtype: tag column qualifier (used by the HBase mode only).
        :return: ``(X, Y)`` -- the list of feature vectors and the list of
            labels, in matching order.
        :raises Exception: ``"Unknown mode!"`` for any other ``mode``.
        :raises KeyError: in local mode, when a listed image has no
            descriptor file.
        """
        # The list file's class column is written as 0/1 by ``_anaylis``;
        # 'True' is still accepted for tags stored in that form.
        positive_tags = ('True', '1')

        X = []
        Y = []

        if mode == "local":

            dict_tagbuf = {}
            with open(self.list_file, 'rb') as tsvfile:
                for line in csv.reader(tsvfile, delimiter='\t'):
                    dict_tagbuf[line[0] + '.jpg'] = line[-1]

            suffix = '.' + feattype
            dict_dataset = {}
            for path, subdirs, files in os.walk(self.feat_dir):
                for name in files:
                    # Only read descriptors of the requested feature type.
                    if not name.endswith(suffix):
                        continue
                    imgname = os.path.basename(path) + name[:-len(suffix)] + '.jpg'
                    with open(os.path.join(path, name), 'rb') as featfile:
                        dict_dataset[imgname] = json.loads(featfile.read())

            for imgname, tag in dict_tagbuf.items():
                X.append(dict_dataset[imgname])
                Y.append(1 if tag in positive_tags else 0)

        elif mode == "remote" or mode == "hbase":
            if self.table is None:
                self.table = self.get_table()

            col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype
            for key, data in self.table.scan(columns=[col_feat, col_tag]):
                X.append(json.loads(data[col_feat]))
                Y.append(1 if data[col_tag] in positive_tags else 0)

        elif mode == "spark" or mode == "cluster":
            if self.sparkcontex is None:
                self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077')

            # Each item of the result is a (feature, tag) pair.
            result = self.sparkcontex.read_habase(self.table_name)
            for feat, tag in result:
                X.append(feat)
                Y.append(tag)

        else:
            raise Exception("Unknown mode!")

        return X, Y
f4fb4381   Chunk   staged.

2c2d57c7   Chunk   ILSVRC datapath h...

51708346   Chunk   final experiments...

2c2d57c7   Chunk   ILSVRC datapath h...

d47ae6ce   Chunk   staged.

f1fa5b17   Chunk   review & streaming.

d47ae6ce   Chunk   staged.

2c2d57c7   Chunk   ILSVRC datapath h...

ad70caf6   Chunk   staged.

2c2d57c7   Chunk   ILSVRC datapath h...

24768a99   Chunk   mode 'hbase' fini...

2c2d57c7   Chunk   ILSVRC datapath h...

489c5608   Chunk   debugging...

2c2d57c7   Chunk   ILSVRC datapath h...

2c2d57c7   Chunk   ILSVRC datapath h...

84648488   Chunk   reverted.

cb798a7f   Chunk   libs & scripts in...

080c30c2   Chunk   F5 lib updated. I...

cb798a7f   Chunk   libs & scripts in...

84648488   Chunk   reverted.

554a7b9a   Chunk   staged.

cb798a7f   Chunk   libs & scripts in...

cb798a7f   Chunk   libs & scripts in...

080c30c2   Chunk   F5 lib updated. I...

cb798a7f   Chunk   libs & scripts in...

84648488   Chunk   reverted.

080c30c2   Chunk   F5 lib updated. I...

2c2d57c7   Chunk   ILSVRC datapath h...

080c30c2   Chunk   F5 lib updated. I...

2c2d57c7   Chunk   ILSVRC datapath h...

080c30c2   Chunk   F5 lib updated. I...

080c30c2   Chunk   F5 lib updated. I...

2c2d57c7   Chunk   ILSVRC datapath h...

84648488   Chunk   reverted.

2c2d57c7   Chunk   ILSVRC datapath h...

84648488   Chunk   reverted.

bde8352b   Chunk   shuffling.

f1fa5b17   Chunk   review & streaming.

2c2d57c7   Chunk   ILSVRC datapath h...

d0be60e7   Chunk   jpeg update.

ec755e37   Chunk   cropping.

bbd2f705   Chunk   cropping.

ec755e37   Chunk   cropping.

b9990e77   Chunk   staged.

84648488   Chunk   reverted.

bde8352b   Chunk   shuffling.

e6be6b61   Chunk   import caffe.

b9990e77   Chunk   staged.

ec755e37   Chunk   cropping.

d0be60e7   Chunk   jpeg update.

b9990e77   Chunk   staged.

2c2d57c7   Chunk   ILSVRC datapath h...

ec755e37   Chunk   cropping.

2c2d57c7   Chunk   ILSVRC datapath h...

ec755e37   Chunk   cropping.

84648488   Chunk   reverted.

02528074   Chunk   staged.

2c2d57c7   Chunk   ILSVRC datapath h...

ec755e37   Chunk   cropping.

02528074   Chunk   staged.

84648488   Chunk   reverted.

2c2d57c7   Chunk   ILSVRC datapath h...

02528074   Chunk   staged.

2c2d57c7   Chunk   ILSVRC datapath h...

cb798a7f   Chunk   libs & scripts in...

2c2d57c7   Chunk   ILSVRC datapath h...

bde8352b   Chunk   shuffling.

2c2d57c7   Chunk   ILSVRC datapath h...

bde8352b   Chunk   shuffling.

84648488   Chunk   reverted.