Commit 0d9a20eae7c4bfe5b95f8af39b182ec53e01b979

Authored by Chunk
1 parent c7fa1d60

staged.

common.pyc
No preview for this file type
mdata/CV.py
1 1 __author__ = 'chunk'
2 2  
3 3 from mdata import *
4   -from mfeat import *
  4 +from mfeat import HOG
5 5  
6 6 import os, sys
7 7 from PIL import Image
... ... @@ -15,11 +15,9 @@ import happybase
15 15  
16 16  
17 17 class DataCV(DataDumperBase):
18   - def __init__(self, base_dir='/home/hadoop/data/HeadShoulder/', sub_dir='Train/'):
19   - DataDumperBase.__init__(self)
20   - self.base_dir = base_dir
21   - self.sub_dir = sub_dir
22   -
  18 + def __init__(self, base_dir='/home/hadoop/data/HeadShoulder/', category='Train'):
  19 + DataDumperBase.__init__(self, base_dir, category)
  20 + self.data_dir = self.base_dir + self.category + '/'
23 21 self.dict_data = {}
24 22  
25 23 def format(self):
... ... @@ -38,20 +36,20 @@ class DataCV(DataDumperBase):
38 36  
39 37 # origin:
40 38 # dir = base_dir + 'Img/Train/' + index[:3]
41   - dir = self.base_dir + self.sub_dir + 'Img/' + index[:3]
  39 + dir = self.img_dir + index[:3] + '/'
42 40 if not os.path.exists(dir):
43 41 os.makedirs(dir)
44   - path = dir + '/' + index[3:] + '.jpg'
45   - print path
  42 + dst = dir + index[3:] + '.jpg'
  43 + print dst
46 44  
47   - if not os.path.exists(path):
48   - shutil.copy(image, path)
  45 + if not os.path.exists(dst):
  46 + shutil.copy(image, dst)
49 47 else:
50 48 pass
51 49  
52 50  
53 51 def extract(self):
54   - for path, subdirs, files in os.walk(self.base_dir + 'Orig/'):
  52 + for path, subdirs, files in os.walk(self.data_dir):
55 53 for name in files:
56 54 imagepath = os.path.join(path, name)
57 55 print imagepath
... ... @@ -62,136 +60,121 @@ class DataCV(DataDumperBase):
62 60  
63 61 ordict_img = collections.OrderedDict(sorted(self.dict_data.items(), key=lambda d: d[0]))
64 62  
65   - lstfile = self.base_dir + self.sub_dir + 'Img/Image.tsv'
66   - with open(lstfile, 'w') as f:
  63 + with open(self.list_file, 'w') as f:
67 64 tsvfile = csv.writer(f, delimiter='\t')
68 65 for key, value in ordict_img.items():
69 66 tsvfile.writerow([key] + [value])
70 67  
  68 + def get_table(self):
  69 + if self.table != None:
  70 + return self.table
71 71  
72   - def get_table(self, tablename, connection=None):
73   - if connection is not None:
74   - c = connection
75   - else:
  72 + if self.connection is None:
76 73 c = happybase.Connection('HPC-server')
77   - tables = c.tables()
78   - if tablename not in tables:
  74 + self.connection = c
  75 +
  76 + tables = self.connection.tables()
  77 + if self.table_name not in tables:
79 78 families = {'cf_pic': dict(),
80 79 'cf_info': dict(max_versions=10),
81 80 'cf_tag': dict(),
82 81 'cf_feat': dict(),
83 82 }
84   - c.create_table(name=tablename, families=families)
  83 + self.connection.create_table(name=self.table_name, families=families)
  84 +
  85 + table = self.connection.table(name=self.table_name)
  86 +
  87 + self.table = table
85 88  
86   - tb = c.table(name=tablename)
87   - return tb
  89 + return table
88 90  
89 91  
90   - def store_image(self, table):
91   - timer.mark()
92   - dir = self.base_dir + self.sub_dir + 'Img/'
93   - maplst = dir + 'Image.tsv'
  92 + def store_image(self):
  93 + if self.table == None:
  94 + self.table = self.get_table()
94 95  
95 96 dict_databuf = {}
96 97  
97   - with open(maplst, 'rb') as tsvfile:
  98 + with open(self.list_file, 'rb') as tsvfile:
98 99 tsvfile = csv.reader(tsvfile, delimiter='\t')
99 100 for line in tsvfile:
100   - path_img = self.base_dir + self.sub_dir + 'Img/Train/' + line[0][:3] + '/' + line[0][3:] + '.jpg'
  101 + path_img = self.img_dir + line[0][:3] + '/' + line[0][3:] + '.jpg'
101 102 if path_img:
102 103 with open(path_img, 'rb') as fpic:
103 104 dict_databuf[line[0] + '.jpg'] = fpic.read()
104 105  
105   - timer.report() # 58.761801s
106   - timer.mark()
107 106 try:
108   - with table.batch(batch_size=5000) as b:
  107 + with self.table.batch(batch_size=5000) as b:
109 108 for imgname, imgdata in dict_databuf.items():
110 109 b.put(imgname, {'cf_pic:data': imgdata})
111 110 raise ValueError("Something went wrong!")
112 111 except ValueError:
113 112 pass
114   - timer.report() # 15.570524s
115 113  
116 114  
117   - def store_tag(self, table):
118   - timer.mark()
119   - dir = self.base_dir + self.sub_dir + 'Img/'
120   - maplst = dir + 'Image.tsv'
  115 + def store_tag(self, feattype='hog'):
  116 + if self.table == None:
  117 + self.table = self.get_table()
121 118  
122 119 dict_tagbuf = {}
123 120  
124   - with open(maplst, 'rb') as tsvfile:
  121 + with open(self.list_file, 'rb') as tsvfile:
125 122 tsvfile = csv.reader(tsvfile, delimiter='\t')
126 123 for line in tsvfile:
127 124 dict_tagbuf[line[0] + '.jpg'] = line[1]
128 125  
129   - timer.report() # 0.009741s
130   - timer.mark()
131 126 try:
132   - with table.batch(batch_size=5000) as b:
  127 + with self.table.batch(batch_size=5000) as b:
133 128 for imgname, imgtag in dict_tagbuf.items():
134   - b.put(imgname, {'cf_tag:class': imgtag})
  129 + b.put(imgname, {'cf_tag:' + feattype: imgtag})
135 130 raise ValueError("Something went wrong!")
136 131 except ValueError:
137 132 pass
138   - timer.report() # 0.509696s
139 133  
140 134  
141   - def get_feat(self, category='hog'):
142   - dir = self.base_dir + self.sub_dir + 'Img/'
143   - maplst = dir + 'images_map_Train.tsv'
  135 + def get_feat(self, feattype='hog'):
144 136  
145 137 dict_tagbuf = {}
146   -
147   - with open(maplst, 'rb') as tsvfile:
  138 + with open(self.list_file, 'rb') as tsvfile:
148 139 tsvfile = csv.reader(tsvfile, delimiter='\t')
149 140 for line in tsvfile:
150 141 dict_tagbuf[line[0] + '.jpg'] = line[1]
151 142  
152 143 dict_featbuf = {}
153   -
154   - timer.mark()
155 144 for imgname, imgtag in dict_tagbuf.items():
156 145 # if imgtag == 'True':
157   - path_img = self.base_dir + self.sub_dir + 'Img/Train/' + imgname[:3] + '/' + imgname[3:]
158   - desc = FeatHOG.feat(path_img, size=(48, 48))
  146 + image = self.img_dir + imgname[:3] + '/' + imgname[3:]
  147 + desc = HOG.FeatHOG.feat(image, size=(48, 48))
159 148 dict_featbuf[imgname] = desc
160   - timer.report() # 4.337425s
161 149  
162   - timer.mark()
163 150 for imgname, desc in dict_featbuf.items():
164 151 # print imgname, desc
165   - dir = self.base_dir + self.sub_dir + 'Feat/Train/' + imgname[:3] + '/'
  152 + dir = self.feat_dir + imgname[:3] + '/'
166 153 if not os.path.exists(dir):
167 154 os.makedirs(dir)
168   - featpath = dir + imgname[3:].split('.')[0] + '.' + category
  155 + featpath = dir + imgname[3:].split('.')[0] + '.' + feattype
169 156 with open(featpath, 'wb') as featfile:
170 157 featfile.write(json.dumps(desc.tolist()))
171 158  
172   - timer.report() # 14.862485s
173 159  
  160 + def store_feat(self, feattype='hog'):
  161 + if self.table == None:
  162 + self.table = self.get_table()
174 163  
175   - def store_feat(self, table):
176   - timer.mark()
177   - dir = self.base_dir + self.sub_dir + 'Feat/'
178 164 dict_featbuf = {}
179   - for path, subdirs, files in os.walk(dir + 'Train/'):
  165 + for path, subdirs, files in os.walk(self.feat_dir):
180 166 for name in files:
181 167 featpath = os.path.join(path, name)
182 168 # print featpath
183 169 with open(featpath, 'rb') as featfile:
184   - imgname = path.split('/')[-1] + name.replace('.hog', '.jpg')
  170 + imgname = path.split('/')[-1] + name.replace('.' + feattype, '.jpg')
185 171 dict_featbuf[imgname] = featfile.read()
186 172  
187   - timer.report() # 0.577940s
188   -
189   - timer.mark()
190 173 try:
191   - with table.batch(batch_size=5000) as b:
  174 + with self.table.batch(batch_size=5000) as b:
192 175 for imgname, featdesc in dict_featbuf.items():
193   - b.put(imgname, {'cf_feat:hog': featdesc})
  176 + b.put(imgname, {'cf_feat:' + feattype: featdesc})
194 177 raise ValueError("Something went wrong!")
195 178 except ValueError:
196 179 pass
197   - timer.report() # 76.075477s
198 180 \ No newline at end of file
  181 +
... ...
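
A minimal usage sketch of the refactored DataCV interface above, assuming the HeadShoulder data layout and the 'HPC-server' HBase gateway from this commit are reachable; the explicit table_name assignment is an assumption, since this hunk leaves it at the base-class default:

    from mdata import CV

    dcv = CV.DataCV(base_dir='/home/hadoop/data/HeadShoulder/', category='Train')
    dcv.table_name = 'HeadShoulder-Train'  # assumed; DataCV does not set table_name in this commit
    dcv.get_feat(feattype='hog')    # HOG descriptors dumped as JSON under Feat/<prefix>/
    dcv.store_image()               # raw JPEG bytes -> HBase column cf_pic:data
    dcv.store_tag(feattype='hog')   # tags from file-tag.tsv -> cf_tag:hog
    dcv.store_feat(feattype='hog')  # dumped descriptors -> cf_feat:hog
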
mdata/MSR.py
... ... @@ -19,11 +19,15 @@ import happybase
19 19  
20 20  
21 21 class DataMSR(DataDumperBase):
22   - def __init__(self, base_dir='/home/hadoop/data/MSR-IRC2014/', sub_dir='Dev/', data_file='DevSetImage.tsv'):
23   - DataDumperBase.__init__(self)
24   - self.base_dir = base_dir
25   - self.sub_dir = sub_dir
26   - self.data_file = self.base_dir + self.sub_dir + data_file
  22 + def __init__(self, base_dir='/home/hadoop/data/MSR-IRC2014/', category='Dev',
  23 + data_file='DevSetImage.tsv', tag_file='DevSetLabel.tsv'):
  24 + DataDumperBase.__init__(self, base_dir, category)
  25 +
  26 + self.data_file = self.base_dir + self.category + '/' + data_file
  27 + self.tag_file = self.base_dir + self.category + '/' + tag_file
  28 + self.map_file = self.base_dir + self.category + '/' + 'images_map.tsv'
  29 +
  30 + self.table_name = self.base_dir.split('/')[-2] + '-' + self.category
27 31  
28 32 def format(self):
29 33 self.extract()
... ... @@ -44,105 +48,105 @@ class DataMSR(DataDumperBase):
44 48 with open('res/tmp.jpg', 'rb') as f:
45 49 index = md5(f.read()).hexdigest()
46 50  
47   - dir = self.base_dir + self.sub_dir + 'Img/' + index[:3]
  51 + dir = self.img_dir + index[:3] + '/'
48 52 if not os.path.exists(dir):
49 53 os.makedirs(dir)
50   - path = dir + '/' + index[3:] + '.jpg'
51   - print path
  54 + image = dir + index[3:] + '.jpg'
  55 + print image
52 56  
53   - if not os.path.exists(path):
54   - shutil.copy('res/tmp.jpg', path)
  57 + if not os.path.exists(image):
  58 + shutil.copy('res/tmp.jpg', image)
55 59 # or :
56   - # img.save(path, format='JPEG')
  60 + # img.save(image, format='JPEG')
57 61  
58 62  
59 63 def extract(self):
60   - for name, data in self.load_base64():
61   - self.hash_dump(data)
  64 + for name, data in self._load_base64():
  65 + self._hash_dump(data)
62 66  
63 67  
64 68 def build_list(self):
65   - dir = self.base_dir + self.sub_dir
66   - lst = dir + 'Image.lst'
67   - with open(lst, 'wb') as f:
68   - for path, subdirs, files in os.walk(dir):
  69 + assert self.list_file != None
  70 + with open(self.list_file, 'wb') as f:
  71 + for path, subdirs, files in os.walk(self.img_dir):
69 72 for name in files:
70 73 entry = path.split('/')[-1] + '/' + name
71 74 print entry
72 75 f.write(entry + '\n')
73 76  
74 77  
75   - def get_table(self, tablename, connection=None):
76   - if connection is not None:
77   - c = connection
78   - else:
  78 + def get_table(self):
  79 + if self.table != None:
  80 + return self.table
  81 +
  82 + if self.connection is None:
79 83 c = happybase.Connection('HPC-server')
80   - tables = c.tables()
81   - if tablename not in tables:
  84 + self.connection = c
  85 +
  86 + tables = self.connection.tables()
  87 + if self.table_name not in tables:
82 88 families = {'cf_pic': dict(),
83 89 'cf_info': dict(max_versions=10),
84 90 'cf_tag': dict(),
  91 + 'cf_feat': dict(),
85 92 }
86   - c.create_table(name=tablename, families=families)
  93 + self.connection.create_table(name=self.table_name, families=families)
  94 +
  95 + table = self.connection.table(name=self.table_name)
87 96  
88   - tb = c.table(name=tablename)
89   - return tb
  97 + self.table = table
90 98  
  99 + return table
  100 +
  101 +
  102 + def store_image(self):
  103 + if self.table == None:
  104 + self.table = self.get_table()
91 105  
92   - def store_image(self, table):
93   - timer.mark()
94   - dir = self.base_dir + self.sub_dir + 'Img2/'
95   - lst = dir + 'Image.lst'
96 106 dict_buffer = {}
97   - with open(lst, 'rb') as f:
  107 + with open(self.list_file, 'rb') as f:
98 108 for line in f:
99 109 path_img = line.strip('\n')
100 110 if path_img:
101   - with open(dir + 'Dev/' + path_img, 'rb') as fpic:
  111 + with open(self.img_dir + path_img, 'rb') as fpic:
102 112 dict_buffer[path_img.replace('/', '')] = fpic.read()
103   - timer.report() # 1.507566s
104   - timer.mark()
  113 +
105 114 try:
106   - with table.batch(batch_size=5000) as b:
  115 + with self.table.batch(batch_size=5000) as b:
107 116 for imgname, imgdata in dict_buffer.items():
108 117 b.put(imgname, {'cf_pic:data': imgdata})
109 118 raise ValueError("Something went wrong!")
110 119 except ValueError:
111 120 pass
112   - timer.report() # 228.003684s
113 121  
114 122  
115   - def store_tag(self, table):
116   - timer.mark()
117   - dir = self.base_dir + self.sub_dir + 'Img2/'
118   - maplst = dir + 'Image.tsv'
119   - taglist = self.base_dir + self.sub_dir + 'Dev/DevSetLabel.tsv'
  123 + def store_tag(self):
  124 + if self.table == None:
  125 + self.table = self.get_table()
  126 +
120 127 dict_namebuf = {}
121 128 dict_tagbuf = {}
122 129  
123   - with open(maplst, 'rb') as tsvfile:
  130 + with open(self.map_file, 'rb') as tsvfile:
124 131 tsvfile = csv.reader(tsvfile, delimiter='\t')
125 132 for line in tsvfile:
126 133 dict_namebuf[line[0]] = line[2]
127 134  
128   - with open(taglist, 'rb') as tsvfile:
  135 + with open(self.tag_file, 'rb') as tsvfile:
129 136 tsvfile = csv.reader(tsvfile, delimiter='\t')
130 137 for line in tsvfile:
131 138 dict_tagbuf[line[-2]] = (line[:-2], line[-1])
132 139  
133   - timer.report() # 0.148540s
134   - timer.mark()
135 140 try:
136   - with table.batch(batch_size=5000) as b:
  141 + with self.table.batch(batch_size=5000) as b:
137 142 for key, value in dict_tagbuf.items():
138 143 b.put(dict_namebuf[key] + '.jpg', {'cf_tag:' + ''.join(value[0]): value[1]})
139 144 raise ValueError("Something went wrong!")
140 145 except ValueError:
141 146 pass
142   - timer.report() # 3.280105s
143 147  
144   - def get_feat(self, category):
  148 + def get_feat(self, feattype):
145 149 pass
146 150  
147   - def store_feat(self, table, category):
  151 + def store_feat(self, feattype):
148 152 pass
... ...
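
For reference, a hedged sketch of reading back a row written by store_image/store_tag above. The table name follows table_name = base_dir.split('/')[-2] + '-' + category from this commit; the row key shown is hypothetical (keys are the hashed image names with '/' stripped):

    import happybase

    conn = happybase.Connection('HPC-server')   # same Thrift gateway used throughout this commit
    table = conn.table('MSR-IRC2014-Dev')       # '<dataset dir>-<category>'
    row = table.row('0a153e...jpg')             # hypothetical hashed row key
    img_bytes = row.get('cf_pic:data')          # raw JPEG bytes, or None if the row is missing
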
mdata/MSR.pyc 0 → 100644
No preview for this file type
mdata/__init__.py
  1 +# -*- coding: utf-8 -*-
1 2 __author__ = 'chunk'
2 3  
3   -__all__ = ['DataDumperBase', ]
  4 +__all__ = ['DataDumperBase']
4 5  
5 6  
6 7 class DataDumperBase(object):
... ... @@ -8,17 +9,33 @@ class DataDumperBase(object):
8 9 Base class for image data dumping & retrieving.
9 10 A regular directory pattern would be like this:
10 11  
11   - ├── file-tag-list.tsv
12   - │
13   - ├── Feat
14   - │   ├── 0a1
15   - │   └── 53e
16   - │   └── ...
17   - |
18   - └── Img
19   -     ├── 0a1
20   -     └── 53e
21   -     └── ...
  12 + ├── Dev (category)
  13 +     ├── file-tag.tsv (list_file)
  14 +     │
  15 +     ├── Feat (feat_dir)
  16 +     │   ├── 0a1
  17 +     │   └── 53e
  18 +     │   └── ...
  19 +     |
  20 +     └── Img (img_dir)
  21 +         ├── 0a1
  22 +         └── 53e
  23 +         └── ...
  24 + ├── Train (category)
  25 +     ├── file-tag.tsv (list_file)
  26 +     │
  27 +     ├── Feat
  28 +     │   ├── 032
  29 +     │   └── a21
  30 +     │   └── ...
  31 +     |
  32 +     └── Img
  33 +         ├── 032
  34 +         └── a21
  35 +         └── ...
  36 + .
  37 + .
  38 + .
22 39  
23 40 It can be refactored from the original pattern, which is supposed to be generated by web crawlers:
24 41  
... ... @@ -32,38 +49,47 @@ class DataDumperBase(object):
32 49 └── ddd.jpg
33 50 └── ...
34 51  
  52 +
35 53 convention:
36 54 'img' for image file data while 'image' for file path;
37 55  
38 56 """
39 57  
40   - def __init__(self):
41   - self.base_dir = None
42   - self.list_file = None
43   - self.dict_data = None
  58 + def __init__(self, base_dir, category):
  59 + """
  60 + base_dir: e.g. '/home/hadoop/data/MSR-IRC2014/'
  61 + list_file: not data_file! e.g. 'file-tag.tsv'
  62 + dict_data: e.g. {'filename':rawdata} or {'filename':tag}
  63 + """
  64 + self.base_dir = base_dir
  65 + self.category = category
  66 + self.dst_dir = self.base_dir + 'dst/' + self.category + '/'
  67 +
  68 + self.list_file = self.dst_dir + 'file-tag.tsv'
  69 + self.feat_dir = self.dst_dir + 'Feat/'
  70 + self.img_dir = self.dst_dir + 'Img/'
44 71  
45   - # self.table_name = None
46   - # self.table = None
47   - # self.connection = None
  72 + self.table_name = None
  73 + self.table = None
  74 + self.connection = None
48 75  
49 76 def format(self):
50 77 pass
51 78  
52 79  
53   -
54   - def get_table(self, tablename, connection=None):
  80 + def get_table(self):
55 81 pass
56 82  
57   - def store_img(self, table):
  83 + def store_img(self):
58 84 pass
59 85  
60   - def store_tag(self, table, category):
  86 + def store_tag(self, feattype):
61 87 pass
62 88  
63   - def get_feat(self, category):
  89 + def get_feat(self, feattype):
64 90 pass
65 91  
66   - def store_feat(self, table, category):
  92 + def store_feat(self, feattype):
67 93 pass
68 94  
69 95  
... ...
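
A small illustrative sketch of the concrete paths DataDumperBase.__init__ now derives from base_dir and category (values are examples only):

    base_dir  = '/home/hadoop/data/MSR-IRC2014/'
    category  = 'Dev'
    dst_dir   = base_dir + 'dst/' + category + '/'   # /home/hadoop/data/MSR-IRC2014/dst/Dev/
    list_file = dst_dir + 'file-tag.tsv'             # the file-tag list
    feat_dir  = dst_dir + 'Feat/'                    # per-image feature dumps
    img_dir   = dst_dir + 'Img/'                     # hashed image tree

Note that __init__ inserts a dst/ level between base_dir and the category directory, which the docstring tree above does not show.
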
mdata/__init__.pyc 0 → 100644
No preview for this file type
mfeat/__init__.py
... ... @@ -7,7 +7,7 @@ import cv2
7 7 from skimage.feature import hog
8 8 from skimage import io, color, transform, exposure
9 9  
10   -__all__ = ['FeatureBase', 'FeatHOG', 'timer']
  10 +__all__ = ['FeatureBase']
11 11  
12 12 timer = ctimer()
13 13  
... ...
mfeat/__init__.pyc 0 → 100644
No preview for this file type
res/tmp.jpg

16.2 KB → 8.99 KB (binary image replaced)
test_data.py 0 → 100644
... ... @@ -0,0 +1,10 @@
  1 +__author__ = 'chunk'
  2 +
  3 +
  4 +from mdata import MSR
  5 +
  6 +msrd = MSR.DataMSR(base_dir='/media/chunk/Elements/D/data/MSR-IRC2014/', category='Train', data_file='TrainImageSet.tsv', tag_file='TrainSetLabel.tsv')
  7 +# msrd.format()
  8 +# msrd.build_list()
  9 +
  10 +print 'hello'
0 11 \ No newline at end of file
... ...