Blame view

mdata/ILSVRC.py 8.73 KB
2c2d57c7   Chunk   ILSVRC datapath h...
1
2
3
__author__ = 'chunk'

from . import *
84648488   Chunk   reverted.
4
from ..mfeat import HOG, IntraBlockDiff
2c2d57c7   Chunk   ILSVRC datapath h...
5
6
7
8
9
10
11
12
13
14
15
16
17
18
from ..mspark import SC
from ..common import *

import os, sys
from PIL import Image
from hashlib import md5
import csv
import shutil
import json
import collections
import happybase

from ..mjpeg import *
from ..msteg import *
9ff70cf4   Chunk   capacity engeneer...
19

2c2d57c7   Chunk   ILSVRC datapath h...
20

d1042d03   Chunk   staged.
21
22
23
24
class DataILSVRC(DataDumperBase):
    def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train'):
        """Set up the on-disk layout and the HBase table name for one split.

        :param base_dir: root directory of the dataset.
        :param category: sub-directory of ``base_dir`` that holds the raw
            images; also used as the suffix of the HBase table name.
        """
        DataDumperBase.__init__(self, base_dir, category)

        self.base_dir = base_dir
        self.category = category

        # Raw input images are read from <base_dir>/<category>.
        self.data_dir = os.path.join(self.base_dir, self.category)

        # Everything produced by this class goes below <base_dir>/dst/<category>.
        self.dst_dir = os.path.join(self.base_dir, 'dst', self.category)
        self.list_file = os.path.join(self.dst_dir, 'file-tag.tsv')
        self.feat_dir = os.path.join(self.dst_dir, 'Feat')
        self.img_dir = os.path.join(self.dst_dir, 'Img')

        # md5 hex digest -> [width, height, file size, quality]; filled by
        # _hash_copy() and dumped by _build_list().
        self.dict_data = {}

        # "<last path component of base_dir>-<category>"
        self.table_name = self.base_dir.strip('/').split('/')[-1] + '-' + self.category
        # Created lazily by load_data() when a spark/cluster mode is requested.
        self.sparkcontex = None

    def format(self):
        """Prepare the dataset on disk; this is a thin alias for ``extract()``."""
        self.extract()

    def _hash_copy(self, image):
        """Register one image and copy it into the hashed image store.

        The file is identified by the md5 hex digest of its bytes. Its
        ``[width, height, file size, quality]`` record is stored in
        ``self.dict_data[digest]`` and the file is copied to
        ``<img_dir>/<digest[:3]>/<digest[3:]>.jpg`` unless that target
        already exists.

        :param image: path of the source image file.
        """
        # Anything whose name does not end in 'jpg' is re-encoded as JPEG first.
        # NOTE(review): the check is case-sensitive and the scratch file
        # 'res/tmp.jpg' is relative to the current working directory and is
        # shared by all calls -- confirm both are intended.
        if not image.endswith('jpg'):
            img = Image.open(image)
            img.save('res/tmp.jpg', format='JPEG')
            image = 'res/tmp.jpg'

        # The digest is taken over the (possibly re-encoded) file content.
        with open(image, 'rb') as f:
            index = md5(f.read()).hexdigest()

        im = Jpeg(image, key=sample_key)
        self.dict_data[index] = [im.image_width, im.image_height, os.path.getsize(image), im.getQuality()]

        # Shard the store by the first three hex characters of the digest.
        target_dir = os.path.join(self.img_dir, index[:3])
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        image_path = os.path.join(target_dir, index[3:] + '.jpg')

        # Identical content hashes to the same path, so an existing file is kept.
        if not os.path.exists(image_path):
            shutil.copy(image, image_path)

84648488   Chunk   reverted.
67
    def _build_list(self):
9ff70cf4   Chunk   capacity engeneer...
68
        assert self.list_file != None
d0be60e7   Chunk   jpeg update.
69
70

        ordict_img = collections.OrderedDict(sorted(self.dict_data.items(), key=lambda d: d[0]))
2c2d57c7   Chunk   ILSVRC datapath h...
71
72

        with open(self.list_file, 'w') as f:
1dc7c44b   Chunk   crawler-hbase-spa...
73
            tsvfile = csv.writer(f, delimiter='\t')
2c2d57c7   Chunk   ILSVRC datapath h...
74
75
76
77
78
79
80
81
82
83
84
            for key, value in ordict_img.items():
                tsvfile.writerow([key] + value)


    def extract(self):
        """Ingest every file below ``self.data_dir`` and then write the list file.

        Each file path is printed, passed to ``_hash_copy`` and, once the walk
        is finished, ``_build_list`` is called once.
        """
        for dirpath, _subdirs, filenames in os.walk(self.data_dir):
            for filename in filenames:
                imagepath = os.path.join(dirpath, filename)
                # Call form: prints the same text on Python 2 and is also
                # valid syntax on Python 3 (the bare statement is not).
                print(imagepath)
                self._hash_copy(imagepath)
        self._build_list()
554a7b9a   Chunk   staged.
85

84648488   Chunk   reverted.
86
87
88
89
90

    def get_table(self):
        """Return the HBase table for this dataset, creating it if necessary.

        The table handle is cached in ``self.table``. A connection to
        'HPC-server' is opened only when ``self.connection`` is ``None``.

        NOTE(review): ``self.table`` and ``self.connection`` are not assigned
        in this class's ``__init__``; presumably the base class initialises
        them -- confirm.

        :return: the table object returned by ``connection.table()``.
        """
        if self.table is not None:
            return self.table

        if self.connection is None:
            self.connection = happybase.Connection('HPC-server')

        if self.table_name not in self.connection.tables():
            families = {
                'cf_pic': dict(),
                'cf_info': dict(max_versions=10),
                'cf_tag': dict(),
                'cf_feat': dict(),
            }
            self.connection.create_table(name=self.table_name, families=families)

        self.table = self.connection.table(name=self.table_name)
        return self.table


    def store_image(self):
        """Upload every image named in the list file to column ``cf_pic:data``.

        The row key is ``<digest>.jpg`` where the digest is the first column of
        ``self.list_file``; the bytes are read from
        ``<img_dir>/<digest[:3]>/<digest[3:]>.jpg``.

        Images are streamed into the batch one at a time instead of being
        collected in memory first. As a consequence, rows queued before a
        failing read may already have been sent when the error propagates.
        Errors from reading or from the batch propagate unchanged.
        """
        if self.table is None:
            self.table = self.get_table()

        with open(self.list_file, 'rb') as listfile:
            with self.table.batch(batch_size=5000) as batch:
                for row in csv.reader(listfile, delimiter='\t'):
                    digest = row[0]
                    img_path = os.path.join(self.img_dir, digest[:3], digest[3:] + '.jpg')
                    with open(img_path, 'rb') as imgfile:
                        batch.put(digest + '.jpg', {'cf_pic:data': imgfile.read()})


    def store_tag(self, tagtype='class'):
        """Upload the last column of the list file as the tag of each image.

        For every row of ``self.list_file`` the value of the last column is
        written to column ``cf_tag:<tagtype>`` of row ``<digest>.jpg``.

        NOTE(review): ``_build_list()`` writes the JPEG quality as the last
        column, so a real tag is only present if the list file was extended
        elsewhere -- confirm.

        :param tagtype: column qualifier inside the ``cf_tag`` family.
        """
        if self.table is None:
            self.table = self.get_table()

        # Row key -> tag; a later row for the same digest overrides an earlier one.
        tags = {}
        with open(self.list_file, 'rb') as listfile:
            for row in csv.reader(listfile, delimiter='\t'):
                tags[row[0] + '.jpg'] = row[-1]

        column = 'cf_tag:' + tagtype
        # Errors raised by the batch propagate to the caller unchanged.
        with self.table.batch(batch_size=5000) as batch:
            for imgname, imgtag in tags.items():
                batch.put(imgname, {column: imgtag})
d1042d03   Chunk   staged.
152

9ff70cf4   Chunk   capacity engeneer...
153

d0be60e7   Chunk   jpeg update.
154
155
    def get_feat(self, image, feattype='ibd', **kwargs):
        """Compute one feature descriptor for ``image``.

        :param image: passed unchanged to the extractor's ``feat()`` method.
        :param feattype: ``'hog'`` or ``'ibd'``.
        :param kwargs: ``size`` (default ``(48, 48)``) is forwarded to the HOG
            extractor; it is not used for ``'ibd'``.
        :return: whatever the selected extractor's ``feat()`` returns.
        :raises Exception: if ``feattype`` is not one of the known types.
        """
        size = kwargs.get('size', (48, 48))

        if feattype == 'hog':
            feater = HOG.FeatHOG(size=size)
        elif feattype == 'ibd':
            feater = IntraBlockDiff.FeatIntraBlockDiff()
        else:
            raise Exception("Unknown feature type!")

        return feater.feat(image)

    def extract_feat(self, feattype='ibd'):
        """Compute and save a feature file for every image in the list file.

        For each digest in the first column of ``self.list_file`` the image
        ``<img_dir>/<digest[:3]>/<digest[3:]>.jpg`` is fed to the extractor and
        the descriptor is written as JSON to
        ``<feat_dir>/<digest[:3]>/<digest[3:]>.<feattype>``.

        Each descriptor is written as soon as it is computed rather than being
        held in memory until all images are processed.

        :param feattype: ``'hog'`` (with size ``(48, 48)``) or ``'ibd'``.
        :raises Exception: if ``feattype`` is not one of the known types.
        """
        if feattype == 'hog':
            feater = HOG.FeatHOG(size=(48, 48))
        elif feattype == 'ibd':
            feater = IntraBlockDiff.FeatIntraBlockDiff()
        else:
            raise Exception("Unknown feature type!")

        with open(self.list_file, 'rb') as listfile:
            digests = [row[0] for row in csv.reader(listfile, delimiter='\t')]

        for digest in digests:
            image = os.path.join(self.img_dir, digest[:3], digest[3:] + '.jpg')
            desc = feater.feat(image)

            out_dir = os.path.join(self.feat_dir, digest[:3])
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            featpath = os.path.join(out_dir, digest[3:].split('.')[0] + '.' + feattype)
            with open(featpath, 'wb') as featfile:
                # The descriptor must offer tolist() (e.g. a numpy array).
                featfile.write(json.dumps(desc.tolist()))


    def store_feat(self, feattype='ibd'):
        """Upload saved feature files of one type to column ``cf_feat:<feattype>``.

        Every file below ``self.feat_dir`` whose name ends in ``.<feattype>``
        is stored under the row key
        ``<name of its directory><file name without suffix>.jpg``.

        Files with a different suffix are skipped. Previously they were
        uploaded as well, under a row key that kept their own suffix and in
        the column of the requested ``feattype``.

        :param feattype: feature type, used as file suffix and column qualifier.
        """
        if self.table is None:
            self.table = self.get_table()

        suffix = '.' + feattype
        column = 'cf_feat:' + feattype

        # Errors raised by the batch propagate to the caller unchanged.
        with self.table.batch(batch_size=5000) as batch:
            for dirpath, _subdirs, filenames in os.walk(self.feat_dir):
                # The directory name is the first three characters of the digest.
                prefix = os.path.basename(dirpath)
                for filename in filenames:
                    if not filename.endswith(suffix):
                        continue
                    imgname = prefix + filename[:-len(suffix)] + '.jpg'
                    with open(os.path.join(dirpath, filename), 'rb') as featfile:
                        batch.put(imgname, {column: featfile.read()})
84648488   Chunk   reverted.
220

9ff70cf4   Chunk   capacity engeneer...
221
222
223
224
225
226
227
228
229
    def load_data(self, mode='local', feattype='ibd', tagtype='class'):
        """Load feature vectors and binary labels.

        :param mode: where to read from:
            ``'local'`` -- the list file and the JSON feature files on disk;
            ``'remote'``/``'hbase'`` -- a scan of the HBase table;
            ``'spark'``/``'cluster'`` -- the project's Spark helper.
        :param feattype: feature type (file suffix / ``cf_feat`` qualifier).
        :param tagtype: ``cf_tag`` qualifier; only used by the HBase mode.
        :return: ``(X, Y)``, two parallel lists. In the local and HBase modes
            a label is 1 when the stored tag equals the string ``'True'`` and
            0 otherwise; in the Spark mode the values are passed through.
        :raises Exception: if ``mode`` is not one of the known modes.
        """
        X = []
        Y = []

        if mode == "local":
            # Row key -> raw tag, kept in list-file order so the result order
            # is deterministic.
            # NOTE(review): _build_list() writes the JPEG quality as the last
            # column, so tags equal 'True' only if the list file was extended
            # elsewhere -- confirm.
            tags = collections.OrderedDict()
            with open(self.list_file, 'rb') as listfile:
                for row in csv.reader(listfile, delimiter='\t'):
                    tags[row[0] + '.jpg'] = row[-1]

            suffix = '.' + feattype
            feats = {}
            for dirpath, _subdirs, filenames in os.walk(self.feat_dir):
                prefix = os.path.basename(dirpath)
                for filename in filenames:
                    # Only the requested feature type; other files can never
                    # match a '<digest>.jpg' key.
                    if not filename.endswith(suffix):
                        continue
                    imgname = prefix + filename[:-len(suffix)] + '.jpg'
                    with open(os.path.join(dirpath, filename), 'rb') as featfile:
                        feats[imgname] = json.loads(featfile.read())

            # A listed image without a feature file raises KeyError.
            for imgname, tag in tags.items():
                X.append(feats[imgname])
                Y.append(1 if tag == 'True' else 0)

        elif mode == "remote" or mode == "hbase":
            if self.table is None:
                self.table = self.get_table()

            col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype
            for _key, data in self.table.scan(columns=[col_feat, col_tag]):
                X.append(json.loads(data[col_feat]))
                Y.append(1 if data[col_tag] == 'True' else 0)

        elif mode == "spark" or mode == "cluster":
            if self.sparkcontex is None:
                self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077')

            # NOTE(review): iterated as (feat, tag) pairs; confirm against the
            # helper's actual return shape.
            result = self.sparkcontex.read_habase(self.table_name)
            for feat, tag in result:
                X.append(feat)
                Y.append(tag)
        else:
            raise Exception("Unknown mode!")

        return X, Y
554a7b9a   Chunk   staged.

f1fa5b17   Chunk   review & streaming.

9371f8fa   Chunk   SVM param engenee...

9ff70cf4   Chunk   capacity engeneer...

84648488   Chunk   reverted.

ec755e37   Chunk   cropping.

e6be6b61   Chunk   import caffe.

ec755e37   Chunk   cropping.

b9990e77   Chunk   staged.

ec755e37   Chunk   cropping.

e6be6b61   Chunk   import caffe.

ec755e37   Chunk   cropping.

b9990e77   Chunk   staged.

25c0c9c9   Chunk   feat.ravel()[[i*3...

e6be6b61   Chunk   import caffe.

bde8352b   Chunk   shuffling.

ec755e37   Chunk   cropping.

bde8352b   Chunk   shuffling.

ec755e37   Chunk   cropping.

84648488   Chunk   reverted.

2c2d57c7   Chunk   ILSVRC datapath h...

f1fa5b17   Chunk   review & streaming.

2c2d57c7   Chunk   ILSVRC datapath h...

f4fb4381   Chunk   staged.

2c2d57c7   Chunk   ILSVRC datapath h...

51708346   Chunk   final experiments...

2c2d57c7   Chunk   ILSVRC datapath h...

d47ae6ce   Chunk   staged.

f1fa5b17   Chunk   review & streaming.

d47ae6ce   Chunk   staged.

2c2d57c7   Chunk   ILSVRC datapath h...

ad70caf6   Chunk   staged.

2c2d57c7   Chunk   ILSVRC datapath h...

24768a99   Chunk   mode 'hbase' fini...

2c2d57c7   Chunk   ILSVRC datapath h...

489c5608   Chunk   debugging...

2c2d57c7   Chunk   ILSVRC datapath h...

2c2d57c7   Chunk   ILSVRC datapath h...

84648488   Chunk   reverted.

cb798a7f   Chunk   libs & scripts in...

080c30c2   Chunk   F5 lib updated. I...

cb798a7f   Chunk   libs & scripts in...

84648488   Chunk   reverted.

554a7b9a   Chunk   staged.

cb798a7f   Chunk   libs & scripts in...

cb798a7f   Chunk   libs & scripts in...

080c30c2   Chunk   F5 lib updated. I...

cb798a7f   Chunk   libs & scripts in...

84648488   Chunk   reverted.

080c30c2   Chunk   F5 lib updated. I...

2c2d57c7   Chunk   ILSVRC datapath h...

080c30c2   Chunk   F5 lib updated. I...

2c2d57c7   Chunk   ILSVRC datapath h...

080c30c2   Chunk   F5 lib updated. I...

080c30c2   Chunk   F5 lib updated. I...

2c2d57c7   Chunk   ILSVRC datapath h...

84648488   Chunk   reverted.

2c2d57c7   Chunk   ILSVRC datapath h...

84648488   Chunk   reverted.

bde8352b   Chunk   shuffling.

f1fa5b17   Chunk   review & streaming.

2c2d57c7   Chunk   ILSVRC datapath h...

d0be60e7   Chunk   jpeg update.

ec755e37   Chunk   cropping.

bbd2f705   Chunk   cropping.

ec755e37   Chunk   cropping.

b9990e77   Chunk   staged.

84648488   Chunk   reverted.

bde8352b   Chunk   shuffling.

e6be6b61   Chunk   import caffe.

b9990e77   Chunk   staged.

ec755e37   Chunk   cropping.

d0be60e7   Chunk   jpeg update.

b9990e77   Chunk   staged.

2c2d57c7   Chunk   ILSVRC datapath h...

ec755e37   Chunk   cropping.

2c2d57c7   Chunk   ILSVRC datapath h...

ec755e37   Chunk   cropping.

84648488   Chunk   reverted.

02528074   Chunk   staged.

2c2d57c7   Chunk   ILSVRC datapath h...

ec755e37   Chunk   cropping.

02528074   Chunk   staged.

84648488   Chunk   reverted.

2c2d57c7   Chunk   ILSVRC datapath h...

02528074   Chunk   staged.

2c2d57c7   Chunk   ILSVRC datapath h...

cb798a7f   Chunk   libs & scripts in...

2c2d57c7   Chunk   ILSVRC datapath h...

bde8352b   Chunk   shuffling.

2c2d57c7   Chunk   ILSVRC datapath h...

bde8352b   Chunk   shuffling.

84648488   Chunk   reverted.