Commit 0d9a20eae7c4bfe5b95f8af39b182ec53e01b979
1 parent: c7fa1d60
Exists in master and in 2 other branches: staged
Showing 10 changed files with 166 additions and 143 deletions
common.pyc
No preview for this file type
mdata/CV.py
1 | 1 | __author__ = 'chunk' |
2 | 2 | |
3 | 3 | from mdata import * |
4 | -from mfeat import * | |
4 | +from mfeat import HOG | |
5 | 5 | |
6 | 6 | import os, sys |
7 | 7 | from PIL import Image |
... | ... | @@ -15,11 +15,9 @@ import happybase |
15 | 15 | |
16 | 16 | |
17 | 17 | class DataCV(DataDumperBase): |
18 | - def __init__(self, base_dir='/home/hadoop/data/HeadShoulder/', sub_dir='Train/'): | |
19 | - DataDumperBase.__init__(self) | |
20 | - self.base_dir = base_dir | |
21 | - self.sub_dir = sub_dir | |
22 | - | |
18 | + def __init__(self, base_dir='/home/hadoop/data/HeadShoulder/', category='Train'): | |
19 | + DataDumperBase.__init__(self, base_dir, category) | |
20 | + self.data_dir = self.base_dir + self.category + '/' | |
23 | 21 | self.dict_data = {} |
24 | 22 | |
25 | 23 | def format(self): |
... | ... | @@ -38,20 +36,20 @@ class DataCV(DataDumperBase): |
38 | 36 | |
39 | 37 | # origion: |
40 | 38 | # dir = base_dir + 'Img/Train/' + index[:3] |
41 | - dir = self.base_dir + self.sub_dir + 'Img/' + index[:3] | |
39 | + dir = self.img_dir + index[:3] + '/' | |
42 | 40 | if not os.path.exists(dir): |
43 | 41 | os.makedirs(dir) |
44 | - path = dir + '/' + index[3:] + '.jpg' | |
45 | - print path | |
42 | + image = dir + index[3:] + '.jpg' | |
43 | + print image | |
46 | 44 | |
47 | - if not os.path.exists(path): | |
48 | - shutil.copy(image, path) | |
45 | + if not os.path.exists(image): | |
46 | + shutil.copy(image, image) | |
49 | 47 | else: |
50 | 48 | pass |
51 | 49 | |
52 | 50 | |
53 | 51 | def extract(self): |
54 | - for path, subdirs, files in os.walk(self.base_dir + 'Orig/'): | |
52 | + for path, subdirs, files in os.walk(self.data_dir): | |
55 | 53 | for name in files: |
56 | 54 | imagepath = os.path.join(path, name) |
57 | 55 | print imagepath |
... | ... | @@ -62,136 +60,121 @@ class DataCV(DataDumperBase): |
62 | 60 | |
63 | 61 | ordict_img = collections.OrderedDict(sorted(self.dict_data.items(), key=lambda d: d[0])) |
64 | 62 | |
65 | - lstfile = self.base_dir + self.sub_dir + 'Img/Image.tsv' | |
66 | - with open(lstfile, 'w') as f: | |
63 | + with open(self.list_file, 'w') as f: | |
67 | 64 | tsvfile = csv.writer(f, delimiter='\t') |
68 | 65 | for key, value in ordict_img.items(): |
69 | 66 | tsvfile.writerow([key] + [value]) |
70 | 67 | |
68 | + def get_table(self): | |
69 | + if self.table != None: | |
70 | + return self.table | |
71 | 71 | |
72 | - def get_table(self, tablename, connection=None): | |
73 | - if connection is not None: | |
74 | - c = connection | |
75 | - else: | |
72 | + if self.connection is None: | |
76 | 73 | c = happybase.Connection('HPC-server') |
77 | - tables = c.tables() | |
78 | - if tablename not in tables: | |
74 | + self.connection = c | |
75 | + | |
76 | + tables = self.connection.tables() | |
77 | + if self.table_name not in tables: | |
79 | 78 | families = {'cf_pic': dict(), |
80 | 79 | 'cf_info': dict(max_versions=10), |
81 | 80 | 'cf_tag': dict(), |
82 | 81 | 'cf_feat': dict(), |
83 | 82 | } |
84 | - c.create_table(name=tablename, families=families) | |
83 | + self.connection.create_table(name=self.table_name, families=families) | |
84 | + | |
85 | + table = self.connection.table(name=self.table_name) | |
86 | + | |
87 | + self.table = table | |
85 | 88 | |
86 | - tb = c.table(name=tablename) | |
87 | - return tb | |
89 | + return table | |
88 | 90 | |
89 | 91 | |
90 | - def store_image(self, table): | |
91 | - timer.mark() | |
92 | - dir = self.base_dir + self.sub_dir + 'Img/' | |
93 | - maplst = dir + 'Image.tsv' | |
92 | + def store_image(self): | |
93 | + if self.table == None: | |
94 | + self.table = self.get_table() | |
94 | 95 | |
95 | 96 | dict_databuf = {} |
96 | 97 | |
97 | - with open(maplst, 'rb') as tsvfile: | |
98 | + with open(self.list_file, 'rb') as tsvfile: | |
98 | 99 | tsvfile = csv.reader(tsvfile, delimiter='\t') |
99 | 100 | for line in tsvfile: |
100 | - path_img = self.base_dir + self.sub_dir + 'Img/Train/' + line[0][:3] + '/' + line[0][3:] + '.jpg' | |
101 | + path_img = self.img_dir + + line[0][:3] + '/' + line[0][3:] + '.jpg' | |
101 | 102 | if path_img: |
102 | 103 | with open(path_img, 'rb') as fpic: |
103 | 104 | dict_databuf[line[0] + '.jpg'] = fpic.read() |
104 | 105 | |
105 | - timer.report() # 58.761801s | |
106 | - timer.mark() | |
107 | 106 | try: |
108 | - with table.batch(batch_size=5000) as b: | |
107 | + with self.table.batch(batch_size=5000) as b: | |
109 | 108 | for imgname, imgdata in dict_databuf.items(): |
110 | 109 | b.put(imgname, {'cf_pic:data': imgdata}) |
111 | 110 | raise ValueError("Something went wrong!") |
112 | 111 | except ValueError: |
113 | 112 | pass |
114 | - timer.report() # 15.570524s | |
115 | 113 | |
116 | 114 | |
117 | - def store_tag(self, table): | |
118 | - timer.mark() | |
119 | - dir = self.base_dir + self.sub_dir + 'Img/' | |
120 | - maplst = dir + 'Image.tsv' | |
115 | + def store_tag(self, feattype='hog'): | |
116 | + if self.table == None: | |
117 | + self.table = self.get_table() | |
121 | 118 | |
122 | 119 | dict_tagbuf = {} |
123 | 120 | |
124 | - with open(maplst, 'rb') as tsvfile: | |
121 | + with open(self.list_file, 'rb') as tsvfile: | |
125 | 122 | tsvfile = csv.reader(tsvfile, delimiter='\t') |
126 | 123 | for line in tsvfile: |
127 | 124 | dict_tagbuf[line[0] + '.jpg'] = line[1] |
128 | 125 | |
129 | - timer.report() # 0.009741s | |
130 | - timer.mark() | |
131 | 126 | try: |
132 | - with table.batch(batch_size=5000) as b: | |
127 | + with self.table.batch(batch_size=5000) as b: | |
133 | 128 | for imgname, imgtag in dict_tagbuf.items(): |
134 | - b.put(imgname, {'cf_tag:class': imgtag}) | |
129 | + b.put(imgname, {'cf_tag:' + feattype: imgtag}) | |
135 | 130 | raise ValueError("Something went wrong!") |
136 | 131 | except ValueError: |
137 | 132 | pass |
138 | - timer.report() # 0.509696s | |
139 | 133 | |
140 | 134 | |
141 | - def get_feat(self, category='hog'): | |
142 | - dir = self.base_dir + self.sub_dir + 'Img/' | |
143 | - maplst = dir + 'images_map_Train.tsv' | |
135 | + def get_feat(self, feattype='hog'): | |
144 | 136 | |
145 | 137 | dict_tagbuf = {} |
146 | - | |
147 | - with open(maplst, 'rb') as tsvfile: | |
138 | + with open(self.list_file, 'rb') as tsvfile: | |
148 | 139 | tsvfile = csv.reader(tsvfile, delimiter='\t') |
149 | 140 | for line in tsvfile: |
150 | 141 | dict_tagbuf[line[0] + '.jpg'] = line[1] |
151 | 142 | |
152 | 143 | dict_featbuf = {} |
153 | - | |
154 | - timer.mark() | |
155 | 144 | for imgname, imgtag in dict_tagbuf.items(): |
156 | 145 | # if imgtag == 'True': |
157 | - path_img = self.base_dir + self.sub_dir + 'Img/Train/' + imgname[:3] + '/' + imgname[3:] | |
158 | - desc = FeatHOG.feat(path_img, size=(48, 48)) | |
146 | + image = self.img_dir + imgname[:3] + '/' + imgname[3:] | |
147 | + desc = HOG.FeatHOG.feat(image, size=(48, 48)) | |
159 | 148 | dict_featbuf[imgname] = desc |
160 | - timer.report() # 4.337425s | |
161 | 149 | |
162 | - timer.mark() | |
163 | 150 | for imgname, desc in dict_featbuf.items(): |
164 | 151 | # print imgname, desc |
165 | - dir = self.base_dir + self.sub_dir + 'Feat/Train/' + imgname[:3] + '/' | |
152 | + dir = self.feat_dir + imgname[:3] + '/' | |
166 | 153 | if not os.path.exists(dir): |
167 | 154 | os.makedirs(dir) |
168 | - featpath = dir + imgname[3:].split('.')[0] + '.' + category | |
155 | + featpath = dir + imgname[3:].split('.')[0] + '.' + feattype | |
169 | 156 | with open(featpath, 'wb') as featfile: |
170 | 157 | featfile.write(json.dumps(desc.tolist())) |
171 | 158 | |
172 | - timer.report() # 14.862485s | |
173 | 159 | |
160 | + def store_feat(self, feattype='hog'): | |
161 | + if self.table == None: | |
162 | + self.table = self.get_table() | |
174 | 163 | |
175 | - def store_feat(self, table): | |
176 | - timer.mark() | |
177 | - dir = self.base_dir + self.sub_dir + 'Feat/' | |
178 | 164 | dict_featbuf = {} |
179 | - for path, subdirs, files in os.walk(dir + 'Train/'): | |
165 | + for path, subdirs, files in os.walk(self.feat_dir): | |
180 | 166 | for name in files: |
181 | 167 | featpath = os.path.join(path, name) |
182 | 168 | # print featpath |
183 | 169 | with open(featpath, 'rb') as featfile: |
184 | - imgname = path.split('/')[-1] + name.replace('.hog', '.jpg') | |
170 | + imgname = path.split('/')[-1] + name.replace('.' + feattype, '.jpg') | |
185 | 171 | dict_featbuf[imgname] = featfile.read() |
186 | 172 | |
187 | - timer.report() # 0.577940s | |
188 | - | |
189 | - timer.mark() | |
190 | 173 | try: |
191 | - with table.batch(batch_size=5000) as b: | |
174 | + with self.table.batch(batch_size=5000) as b: | |
192 | 175 | for imgname, featdesc in dict_featbuf.items(): |
193 | - b.put(imgname, {'cf_feat:hog': featdesc}) | |
176 | + b.put(imgname, {'cf_feat:' + feattype: featdesc}) | |
194 | 177 | raise ValueError("Something went wrong!") |
195 | 178 | except ValueError: |
196 | 179 | pass |
197 | - timer.report() # 76.075477s | |
198 | 180 | \ No newline at end of file |
181 | + |
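
The main change in mdata/CV.py is that get_table now caches the happybase connection and table handle on the instance instead of taking them as arguments. Below is a minimal standalone sketch of that lazy-initialization pattern; the host name 'HPC-server' and the column-family layout are taken from the diff, while the TableCache class name and everything else around it are illustrative, not part of the commit.

    import happybase

    class TableCache(object):
        """Illustrative stand-in for the caching logic added to DataCV.get_table."""

        def __init__(self, table_name, host='HPC-server'):
            self.table_name = table_name
            self.host = host
            self.connection = None
            self.table = None

        def get_table(self):
            # Reuse the handle once it has been created.
            if self.table is not None:
                return self.table
            # Open (and cache) the Thrift connection on first use.
            if self.connection is None:
                self.connection = happybase.Connection(self.host)
            # Create the table only if it does not exist yet.
            if self.table_name not in self.connection.tables():
                families = {'cf_pic': dict(),
                            'cf_info': dict(max_versions=10),
                            'cf_tag': dict(),
                            'cf_feat': dict()}
                self.connection.create_table(name=self.table_name, families=families)
            self.table = self.connection.table(name=self.table_name)
            return self.table

With this shape the caller no longer passes a table around; store_image, store_tag and store_feat can all start from a bare self.get_table() call, which is exactly what the diff does.
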
mdata/MSR.py
... | ... | @@ -19,11 +19,15 @@ import happybase |
19 | 19 | |
20 | 20 | |
21 | 21 | class DataMSR(DataDumperBase): |
22 | - def __init__(self, base_dir='/home/hadoop/data/MSR-IRC2014/', sub_dir='Dev/', data_file='DevSetImage.tsv'): | |
23 | - DataDumperBase.__init__(self) | |
24 | - self.base_dir = base_dir | |
25 | - self.sub_dir = sub_dir | |
26 | - self.data_file = self.base_dir + self.sub_dir + data_file | |
22 | + def __init__(self, base_dir='/home/hadoop/data/MSR-IRC2014/', category='Dev', | |
23 | + data_file='DevSetImage.tsv', tag_file='DevSetLabel.tsv'): | |
24 | + DataDumperBase.__init__(self, base_dir, category) | |
25 | + | |
26 | + self.data_file = self.base_dir + self.category + '/' + data_file | |
27 | + self.tag_file = self.base_dir + self.category + '/' + tag_file | |
28 | + self.map_file = self.base_dir + self.category + '/' + 'images_map.tsv' | |
29 | + | |
30 | + self.table_name = self.base_dir.split('/')[-2] + '-' + self.category | |
27 | 31 | |
28 | 32 | def format(self): |
29 | 33 | self.extract() |
... | ... | @@ -44,105 +48,105 @@ class DataMSR(DataDumperBase): |
44 | 48 | with open('res/tmp.jpg', 'rb') as f: |
45 | 49 | index = md5(f.read()).hexdigest() |
46 | 50 | |
47 | - dir = self.base_dir + self.sub_dir + 'Img/' + index[:3] | |
51 | + dir = self.img_dir + index[:3] + '/' | |
48 | 52 | if not os.path.exists(dir): |
49 | 53 | os.makedirs(dir) |
50 | - path = dir + '/' + index[3:] + '.jpg' | |
51 | - print path | |
54 | + image = dir + index[3:] + '.jpg' | |
55 | + print image | |
52 | 56 | |
53 | - if not os.path.exists(path): | |
54 | - shutil.copy('res/tmp.jpg', path) | |
57 | + if not os.path.exists(image): | |
58 | + shutil.copy('res/tmp.jpg', image) | |
55 | 59 | # or : |
56 | - # img.save(path, format='JPEG') | |
60 | + # img.save(image, format='JPEG') | |
57 | 61 | |
58 | 62 | |
59 | 63 | def extract(self): |
60 | - for name, data in self.load_base64(): | |
61 | - self.hash_dump(data) | |
64 | + for name, data in self._load_base64(): | |
65 | + self._hash_dump(data) | |
62 | 66 | |
63 | 67 | |
64 | 68 | def build_list(self): |
65 | - dir = self.base_dir + self.sub_dir | |
66 | - lst = dir + 'Image.lst' | |
67 | - with open(lst, 'wb') as f: | |
68 | - for path, subdirs, files in os.walk(dir): | |
69 | + assert self.list_file != None | |
70 | + with open(self.list_file, 'wb') as f: | |
71 | + for path, subdirs, files in os.walk(self.img_dir): | |
69 | 72 | for name in files: |
70 | 73 | entry = path.split('/')[-1] + '/' + name |
71 | 74 | print entry |
72 | 75 | f.write(entry + '\n') |
73 | 76 | |
74 | 77 | |
75 | - def get_table(self, tablename, connection=None): | |
76 | - if connection is not None: | |
77 | - c = connection | |
78 | - else: | |
78 | + def get_table(self): | |
79 | + if self.table != None: | |
80 | + return self.table | |
81 | + | |
82 | + if self.connection is None: | |
79 | 83 | c = happybase.Connection('HPC-server') |
80 | - tables = c.tables() | |
81 | - if tablename not in tables: | |
84 | + self.connection = c | |
85 | + | |
86 | + tables = self.connection.tables() | |
87 | + if self.table_name not in tables: | |
82 | 88 | families = {'cf_pic': dict(), |
83 | 89 | 'cf_info': dict(max_versions=10), |
84 | 90 | 'cf_tag': dict(), |
91 | + 'cf_feat': dict(), | |
85 | 92 | } |
86 | - c.create_table(name=tablename, families=families) | |
93 | + self.connection.create_table(name=self.table_name, families=families) | |
94 | + | |
95 | + table = self.connection.table(name=self.table_name) | |
87 | 96 | |
88 | - tb = c.table(name=tablename) | |
89 | - return tb | |
97 | + self.table = table | |
90 | 98 | |
99 | + return table | |
100 | + | |
101 | + | |
102 | + def store_image(self): | |
103 | + if self.table == None: | |
104 | + self.table = self.get_table() | |
91 | 105 | |
92 | - def store_image(self, table): | |
93 | - timer.mark() | |
94 | - dir = self.base_dir + self.sub_dir + 'Img2/' | |
95 | - lst = dir + 'Image.lst' | |
96 | 106 | dict_buffer = {} |
97 | - with open(lst, 'rb') as f: | |
107 | + with open(self.list_file, 'rb') as f: | |
98 | 108 | for line in f: |
99 | 109 | path_img = line.strip('\n') |
100 | 110 | if path_img: |
101 | - with open(dir + 'Dev/' + path_img, 'rb') as fpic: | |
111 | + with open(self.img_dir + path_img, 'rb') as fpic: | |
102 | 112 | dict_buffer[path_img.replace('/', '')] = fpic.read() |
103 | - timer.report() # 1.507566s | |
104 | - timer.mark() | |
113 | + | |
105 | 114 | try: |
106 | - with table.batch(batch_size=5000) as b: | |
115 | + with self.table.batch(batch_size=5000) as b: | |
107 | 116 | for imgname, imgdata in dict_buffer.items(): |
108 | 117 | b.put(imgname, {'cf_pic:data': imgdata}) |
109 | 118 | raise ValueError("Something went wrong!") |
110 | 119 | except ValueError: |
111 | 120 | pass |
112 | - timer.report() # 228.003684s | |
113 | 121 | |
114 | 122 | |
115 | - def store_tag(self, table): | |
116 | - timer.mark() | |
117 | - dir = self.base_dir + self.sub_dir + 'Img2/' | |
118 | - maplst = dir + 'Image.tsv' | |
119 | - taglist = self.base_dir + self.sub_dir + 'Dev/DevSetLabel.tsv' | |
123 | + def store_tag(self): | |
124 | + if self.table == None: | |
125 | + self.table = self.get_table() | |
126 | + | |
120 | 127 | dict_namebuf = {} |
121 | 128 | dict_tagbuf = {} |
122 | 129 | |
123 | - with open(maplst, 'rb') as tsvfile: | |
130 | + with open(self.map_file, 'rb') as tsvfile: | |
124 | 131 | tsvfile = csv.reader(tsvfile, delimiter='\t') |
125 | 132 | for line in tsvfile: |
126 | 133 | dict_namebuf[line[0]] = line[2] |
127 | 134 | |
128 | - with open(taglist, 'rb') as tsvfile: | |
135 | + with open(self.tag_file, 'rb') as tsvfile: | |
129 | 136 | tsvfile = csv.reader(tsvfile, delimiter='\t') |
130 | 137 | for line in tsvfile: |
131 | 138 | dict_tagbuf[line[-2]] = (line[:-2], line[-1]) |
132 | 139 | |
133 | - timer.report() # 0.148540s | |
134 | - timer.mark() | |
135 | 140 | try: |
136 | - with table.batch(batch_size=5000) as b: | |
141 | + with self.table.batch(batch_size=5000) as b: | |
137 | 142 | for key, value in dict_tagbuf.items(): |
138 | 143 | b.put(dict_namebuf[key] + '.jpg', {'cf_tag:' + ''.join(value[0]): value[1]}) |
139 | 144 | raise ValueError("Something went wrong!") |
140 | 145 | except ValueError: |
141 | 146 | pass |
142 | - timer.report() # 3.280105s | |
143 | 147 | |
144 | - def get_feat(self, category): | |
148 | + def get_feat(self, feattype): | |
145 | 149 | pass |
146 | 150 | |
147 | - def store_feat(self, table, category): | |
151 | + def store_feat(self, feattype): | |
148 | 152 | pass |
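
All of the store_* methods in this commit share the same batched-write shape: collect a mapping into a dict, then push it into HBase through table.batch(batch_size=5000), wrapped in a try/except around a deliberately raised ValueError. A stripped-down sketch of just the core pattern is below; the helper name put_rows and its default column are mine, not the commit's. The batch context manager flushes any remaining mutations when it exits.

    def put_rows(table, rows, column='cf_pic:data', batch_size=5000):
        """Write {row_key: value} pairs into `column` of a happybase table,
        sending mutations to the server in chunks of `batch_size`."""
        with table.batch(batch_size=batch_size) as b:
            for row_key, value in rows.items():
                b.put(row_key, {column: value})

For example, DataCV.store_image boils down to put_rows(self.get_table(), dict_databuf, column='cf_pic:data'), and store_feat follows the same shape with a 'cf_feat:' column.
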
No preview for this file type
mdata/__init__.py
1 | +# -*- coding: utf-8 -*- | |
1 | 2 | __author__ = 'chunk' |
2 | 3 | |
3 | -__all__ = ['DataDumperBase', ] | |
4 | +__all__ = ['DataDumperBase'] | |
4 | 5 | |
5 | 6 | |
6 | 7 | class DataDumperBase(object): |
... | ... | @@ -8,17 +9,33 @@ class DataDumperBase(object): |
8 | 9 | Base class for image data dumping & retrieving. |
9 | 10 | A regular directory pattern would be like this: |
10 | 11 | |
11 | - ├── file-tag-list.tsv | |
12 | - │ | |
13 | - ├── Feat | |
14 | - │ ├── 0a1 | |
15 | - │ └── 53e | |
16 | - │ └── ... | |
17 | - | | |
18 | - └── Img | |
19 | - ├── 0a1 | |
20 | - └── 53e | |
21 | - └── ... | |
12 | + ├── Dev (category) | |
13 | + ├── file-tag.tsv (list_file) | |
14 | + │ | |
15 | + ├── Feat (feat_dir) | |
16 | + │ ├── 0a1 | |
17 | + │ └── 53e | |
18 | + │ └── ... | |
19 | + | | |
20 | + └── Img (img_dir) | |
21 | + ├── 0a1 | |
22 | + └── 53e | |
23 | + └── ... | |
24 | + ├── Train (category) | |
25 | + ├── file-tag.tsv (list_file) | |
26 | + │ | |
27 | + ├── Feat | |
28 | + │ ├── 032 | |
29 | + │ └── a21 | |
30 | + │ └── ... | |
31 | + | | |
32 | + └── Img | |
33 | + ├── 032 | |
34 | + └── a21 | |
35 | + └── ... | |
36 | + . | |
37 | + . | |
38 | + . | |
22 | 39 | |
23 | 40 | It can be refractored from the original pattern which is supposed to be generated from web crawlers: |
24 | 41 | |
... | ... | @@ -32,38 +49,47 @@ class DataDumperBase(object): |
32 | 49 | └── ddd.jpg |
33 | 50 | └── ... |
34 | 51 | |
52 | + | |
35 | 53 | convention: |
36 | 54 | 'img' for image file data while 'image' for file path; |
37 | 55 | |
38 | 56 | """ |
39 | 57 | |
40 | - def __init__(self): | |
41 | - self.base_dir = None | |
42 | - self.list_file = None | |
43 | - self.dict_data = None | |
58 | + def __init__(self, base_dir, category): | |
59 | + """ | |
60 | + base_dir: e.g. '/home/hadoop/data/MSR-IRC2014/' | |
61 | + list_file: not data_file! e.g. 'file-tag.tsv' | |
62 | + dict_data: e.g. {'filename':rawdata} or {'filename':tag} | |
63 | + """ | |
64 | + self.base_dir = base_dir | |
65 | + self.category = category | |
66 | + self.dst_dir = self.base_dir + 'dst/' + self.category + '/' | |
67 | + | |
68 | + self.list_file = self.dst_dir + 'file-tag.tsv' | |
69 | + self.feat_dir = self.dst_dir + 'Feat/' | |
70 | + self.img_dir = self.dst_dir + 'Img/' | |
44 | 71 | |
45 | - # self.table_name = None | |
46 | - # self.table = None | |
47 | - # self.connection = None | |
72 | + self.table_name = None | |
73 | + self.table = None | |
74 | + self.connection = None | |
48 | 75 | |
49 | 76 | def format(self): |
50 | 77 | pass |
51 | 78 | |
52 | 79 | |
53 | - | |
54 | - def get_table(self, tablename, connection=None): | |
80 | + def get_table(self): | |
55 | 81 | pass |
56 | 82 | |
57 | - def store_img(self, table): | |
83 | + def store_img(self): | |
58 | 84 | pass |
59 | 85 | |
60 | - def store_tag(self, table, category): | |
86 | + def store_tag(self, feattype): | |
61 | 87 | pass |
62 | 88 | |
63 | - def get_feat(self, category): | |
89 | + def get_feat(self, feattype): | |
64 | 90 | pass |
65 | 91 | |
66 | - def store_feat(self, table, category): | |
92 | + def store_feat(self, feattype): | |
67 | 93 | pass |
68 | 94 | |
69 | 95 | |
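
The docstring and the new DataDumperBase constructor together fix where everything lives: each dumper works under base_dir + 'dst/' + category + '/', with file-tag.tsv, Feat/ and Img/ inside it. A quick illustration of the values the constructor computes, assuming the package is importable (the base_dir below is the MSR default from the diff):

    from mdata import DataDumperBase

    d = DataDumperBase(base_dir='/home/hadoop/data/MSR-IRC2014/', category='Dev')
    print d.dst_dir    # /home/hadoop/data/MSR-IRC2014/dst/Dev/
    print d.list_file  # /home/hadoop/data/MSR-IRC2014/dst/Dev/file-tag.tsv
    print d.img_dir    # /home/hadoop/data/MSR-IRC2014/dst/Dev/Img/
    print d.feat_dir   # /home/hadoop/data/MSR-IRC2014/dst/Dev/Feat/

Subclasses then only add what differs: DataCV derives data_dir from the category, while DataMSR adds data_file, tag_file, map_file and a table name built from base_dir and category.
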
No preview for this file type
mfeat/__init__.py
No preview for this file type
res/tmp.jpg
... | ... | @@ -0,0 +1,10 @@ |
1 | +__author__ = 'chunk' | |
2 | + | |
3 | + | |
4 | +from mdata import MSR | |
5 | + | |
6 | +msrd = MSR.DataMSR(base_dir='/media/chunk/Elements/D/data/MSR-IRC2014/',category='Train',data_file='TrainImageSet.tsv', tag_file='TrainSetLabel.tsv') | |
7 | +# msrd.format() | |
8 | +# msrd.build_list() | |
9 | + | |
10 | +print 'helllo' | |
0 | 11 | \ No newline at end of file |
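
The new script above only constructs a DataMSR instance; the rest of the pipeline implied by this commit would look roughly like the sketch below. The method names come from the diff, but the ordering is my reading of it, and store_image/store_tag additionally need the 'HPC-server' HBase Thrift service to be reachable.

    from mdata import MSR

    msrd = MSR.DataMSR(base_dir='/media/chunk/Elements/D/data/MSR-IRC2014/',
                       category='Train',
                       data_file='TrainImageSet.tsv',
                       tag_file='TrainSetLabel.tsv')
    msrd.format()       # decode the base64 image dump and spread files under Img/<hash prefix>/
    msrd.build_list()   # write the image list to list_file
    msrd.store_image()  # push raw image bytes into the cf_pic column family
    msrd.store_tag()    # push labels into the cf_tag column family
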