Commit 0d9a20eae7c4bfe5b95f8af39b182ec53e01b979
1 parent: c7fa1d60
Exists in master and in 2 other branches: staged
Showing 10 changed files with 166 additions and 143 deletions
common.pyc
No preview for this file type
mdata/CV.py
1 | 1 | __author__ = 'chunk' |
2 | 2 | |
3 | 3 | from mdata import * |
4 | -from mfeat import * | |
4 | +from mfeat import HOG | |
5 | 5 | |
6 | 6 | import os, sys |
7 | 7 | from PIL import Image |
... | ... | @@ -15,11 +15,9 @@ import happybase |
15 | 15 | |
16 | 16 | |
17 | 17 | class DataCV(DataDumperBase): |
18 | - def __init__(self, base_dir='/home/hadoop/data/HeadShoulder/', sub_dir='Train/'): | |
19 | - DataDumperBase.__init__(self) | |
20 | - self.base_dir = base_dir | |
21 | - self.sub_dir = sub_dir | |
22 | - | |
18 | + def __init__(self, base_dir='/home/hadoop/data/HeadShoulder/', category='Train'): | |
19 | + DataDumperBase.__init__(self, base_dir, category) | |
20 | + self.data_dir = self.base_dir + self.category + '/' | |
23 | 21 | self.dict_data = {} |
24 | 22 | |
25 | 23 | def format(self): |
... | ... | @@ -38,20 +36,20 @@ class DataCV(DataDumperBase): |
38 | 36 | |
39 | 37 | # origion: |
40 | 38 | # dir = base_dir + 'Img/Train/' + index[:3] |
41 | - dir = self.base_dir + self.sub_dir + 'Img/' + index[:3] | |
39 | + dir = self.img_dir + index[:3] + '/' | |
42 | 40 | if not os.path.exists(dir): |
43 | 41 | os.makedirs(dir) |
44 | - path = dir + '/' + index[3:] + '.jpg' | |
45 | - print path | |
42 | + image = dir + index[3:] + '.jpg' | |
43 | + print image | |
46 | 44 | |
47 | - if not os.path.exists(path): | |
48 | - shutil.copy(image, path) | |
45 | + if not os.path.exists(image): | |
46 | + shutil.copy(image, image) | |
49 | 47 | else: |
50 | 48 | pass |
51 | 49 | |
52 | 50 | |
53 | 51 | def extract(self): |
54 | - for path, subdirs, files in os.walk(self.base_dir + 'Orig/'): | |
52 | + for path, subdirs, files in os.walk(self.data_dir): | |
55 | 53 | for name in files: |
56 | 54 | imagepath = os.path.join(path, name) |
57 | 55 | print imagepath |
... | ... | @@ -62,136 +60,121 @@ class DataCV(DataDumperBase): |
62 | 60 | |
63 | 61 | ordict_img = collections.OrderedDict(sorted(self.dict_data.items(), key=lambda d: d[0])) |
64 | 62 | |
65 | - lstfile = self.base_dir + self.sub_dir + 'Img/Image.tsv' | |
66 | - with open(lstfile, 'w') as f: | |
63 | + with open(self.list_file, 'w') as f: | |
67 | 64 | tsvfile = csv.writer(f, delimiter='\t') |
68 | 65 | for key, value in ordict_img.items(): |
69 | 66 | tsvfile.writerow([key] + [value]) |
70 | 67 | |
68 | + def get_table(self): | |
69 | + if self.table != None: | |
70 | + return self.table | |
71 | 71 | |
72 | - def get_table(self, tablename, connection=None): | |
73 | - if connection is not None: | |
74 | - c = connection | |
75 | - else: | |
72 | + if self.connection is None: | |
76 | 73 | c = happybase.Connection('HPC-server') |
77 | - tables = c.tables() | |
78 | - if tablename not in tables: | |
74 | + self.connection = c | |
75 | + | |
76 | + tables = self.connection.tables() | |
77 | + if self.table_name not in tables: | |
79 | 78 | families = {'cf_pic': dict(), |
80 | 79 | 'cf_info': dict(max_versions=10), |
81 | 80 | 'cf_tag': dict(), |
82 | 81 | 'cf_feat': dict(), |
83 | 82 | } |
84 | - c.create_table(name=tablename, families=families) | |
83 | + self.connection.create_table(name=self.table_name, families=families) | |
84 | + | |
85 | + table = self.connection.table(name=self.table_name) | |
86 | + | |
87 | + self.table = table | |
85 | 88 | |
86 | - tb = c.table(name=tablename) | |
87 | - return tb | |
89 | + return table | |
88 | 90 | |
89 | 91 | |
90 | - def store_image(self, table): | |
91 | - timer.mark() | |
92 | - dir = self.base_dir + self.sub_dir + 'Img/' | |
93 | - maplst = dir + 'Image.tsv' | |
92 | + def store_image(self): | |
93 | + if self.table == None: | |
94 | + self.table = self.get_table() | |
94 | 95 | |
95 | 96 | dict_databuf = {} |
96 | 97 | |
97 | - with open(maplst, 'rb') as tsvfile: | |
98 | + with open(self.list_file, 'rb') as tsvfile: | |
98 | 99 | tsvfile = csv.reader(tsvfile, delimiter='\t') |
99 | 100 | for line in tsvfile: |
100 | - path_img = self.base_dir + self.sub_dir + 'Img/Train/' + line[0][:3] + '/' + line[0][3:] + '.jpg' | |
101 | + path_img = self.img_dir + + line[0][:3] + '/' + line[0][3:] + '.jpg' | |
101 | 102 | if path_img: |
102 | 103 | with open(path_img, 'rb') as fpic: |
103 | 104 | dict_databuf[line[0] + '.jpg'] = fpic.read() |
104 | 105 | |
105 | - timer.report() # 58.761801s | |
106 | - timer.mark() | |
107 | 106 | try: |
108 | - with table.batch(batch_size=5000) as b: | |
107 | + with self.table.batch(batch_size=5000) as b: | |
109 | 108 | for imgname, imgdata in dict_databuf.items(): |
110 | 109 | b.put(imgname, {'cf_pic:data': imgdata}) |
111 | 110 | raise ValueError("Something went wrong!") |
112 | 111 | except ValueError: |
113 | 112 | pass |
114 | - timer.report() # 15.570524s | |
115 | 113 | |
116 | 114 | |
117 | - def store_tag(self, table): | |
118 | - timer.mark() | |
119 | - dir = self.base_dir + self.sub_dir + 'Img/' | |
120 | - maplst = dir + 'Image.tsv' | |
115 | + def store_tag(self, feattype='hog'): | |
116 | + if self.table == None: | |
117 | + self.table = self.get_table() | |
121 | 118 | |
122 | 119 | dict_tagbuf = {} |
123 | 120 | |
124 | - with open(maplst, 'rb') as tsvfile: | |
121 | + with open(self.list_file, 'rb') as tsvfile: | |
125 | 122 | tsvfile = csv.reader(tsvfile, delimiter='\t') |
126 | 123 | for line in tsvfile: |
127 | 124 | dict_tagbuf[line[0] + '.jpg'] = line[1] |
128 | 125 | |
129 | - timer.report() # 0.009741s | |
130 | - timer.mark() | |
131 | 126 | try: |
132 | - with table.batch(batch_size=5000) as b: | |
127 | + with self.table.batch(batch_size=5000) as b: | |
133 | 128 | for imgname, imgtag in dict_tagbuf.items(): |
134 | - b.put(imgname, {'cf_tag:class': imgtag}) | |
129 | + b.put(imgname, {'cf_tag:' + feattype: imgtag}) | |
135 | 130 | raise ValueError("Something went wrong!") |
136 | 131 | except ValueError: |
137 | 132 | pass |
138 | - timer.report() # 0.509696s | |
139 | 133 | |
140 | 134 | |
141 | - def get_feat(self, category='hog'): | |
142 | - dir = self.base_dir + self.sub_dir + 'Img/' | |
143 | - maplst = dir + 'images_map_Train.tsv' | |
135 | + def get_feat(self, feattype='hog'): | |
144 | 136 | |
145 | 137 | dict_tagbuf = {} |
146 | - | |
147 | - with open(maplst, 'rb') as tsvfile: | |
138 | + with open(self.list_file, 'rb') as tsvfile: | |
148 | 139 | tsvfile = csv.reader(tsvfile, delimiter='\t') |
149 | 140 | for line in tsvfile: |
150 | 141 | dict_tagbuf[line[0] + '.jpg'] = line[1] |
151 | 142 | |
152 | 143 | dict_featbuf = {} |
153 | - | |
154 | - timer.mark() | |
155 | 144 | for imgname, imgtag in dict_tagbuf.items(): |
156 | 145 | # if imgtag == 'True': |
157 | - path_img = self.base_dir + self.sub_dir + 'Img/Train/' + imgname[:3] + '/' + imgname[3:] | |
158 | - desc = FeatHOG.feat(path_img, size=(48, 48)) | |
146 | + image = self.img_dir + imgname[:3] + '/' + imgname[3:] | |
147 | + desc = HOG.FeatHOG.feat(image, size=(48, 48)) | |
159 | 148 | dict_featbuf[imgname] = desc |
160 | - timer.report() # 4.337425s | |
161 | 149 | |
162 | - timer.mark() | |
163 | 150 | for imgname, desc in dict_featbuf.items(): |
164 | 151 | # print imgname, desc |
165 | - dir = self.base_dir + self.sub_dir + 'Feat/Train/' + imgname[:3] + '/' | |
152 | + dir = self.feat_dir + imgname[:3] + '/' | |
166 | 153 | if not os.path.exists(dir): |
167 | 154 | os.makedirs(dir) |
168 | - featpath = dir + imgname[3:].split('.')[0] + '.' + category | |
155 | + featpath = dir + imgname[3:].split('.')[0] + '.' + feattype | |
169 | 156 | with open(featpath, 'wb') as featfile: |
170 | 157 | featfile.write(json.dumps(desc.tolist())) |
171 | 158 | |
172 | - timer.report() # 14.862485s | |
173 | 159 | |
160 | + def store_feat(self, feattype='hog'): | |
161 | + if self.table == None: | |
162 | + self.table = self.get_table() | |
174 | 163 | |
175 | - def store_feat(self, table): | |
176 | - timer.mark() | |
177 | - dir = self.base_dir + self.sub_dir + 'Feat/' | |
178 | 164 | dict_featbuf = {} |
179 | - for path, subdirs, files in os.walk(dir + 'Train/'): | |
165 | + for path, subdirs, files in os.walk(self.feat_dir): | |
180 | 166 | for name in files: |
181 | 167 | featpath = os.path.join(path, name) |
182 | 168 | # print featpath |
183 | 169 | with open(featpath, 'rb') as featfile: |
184 | - imgname = path.split('/')[-1] + name.replace('.hog', '.jpg') | |
170 | + imgname = path.split('/')[-1] + name.replace('.' + feattype, '.jpg') | |
185 | 171 | dict_featbuf[imgname] = featfile.read() |
186 | 172 | |
187 | - timer.report() # 0.577940s | |
188 | - | |
189 | - timer.mark() | |
190 | 173 | try: |
191 | - with table.batch(batch_size=5000) as b: | |
174 | + with self.table.batch(batch_size=5000) as b: | |
192 | 175 | for imgname, featdesc in dict_featbuf.items(): |
193 | - b.put(imgname, {'cf_feat:hog': featdesc}) | |
176 | + b.put(imgname, {'cf_feat:' + feattype: featdesc}) | |
194 | 177 | raise ValueError("Something went wrong!") |
195 | 178 | except ValueError: |
196 | 179 | pass |
197 | - timer.report() # 76.075477s | |
198 | 180 | \ No newline at end of file |
181 | + |
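
The main change in mdata/CV.py is that get_table now caches the happybase connection and table handle on the instance instead of taking them as arguments. Below is a minimal standalone sketch of that lazy-initialization pattern; the host name 'HPC-server' and the column-family layout are taken from the diff, while the TableCache class name and everything else around it are illustrative, not part of the commit.

    import happybase

    class TableCache(object):
        """Illustrative stand-in for the caching logic added to DataCV.get_table."""

        def __init__(self, table_name, host='HPC-server'):
            self.table_name = table_name
            self.host = host
            self.connection = None
            self.table = None

        def get_table(self):
            # Reuse the handle once it has been created.
            if self.table is not None:
                return self.table
            # Open (and cache) the Thrift connection on first use.
            if self.connection is None:
                self.connection = happybase.Connection(self.host)
            # Create the table only if it does not exist yet.
            if self.table_name not in self.connection.tables():
                families = {'cf_pic': dict(),
                            'cf_info': dict(max_versions=10),
                            'cf_tag': dict(),
                            'cf_feat': dict()}
                self.connection.create_table(name=self.table_name, families=families)
            self.table = self.connection.table(name=self.table_name)
            return self.table

With this shape the caller no longer passes a table around; store_image, store_tag and store_feat can all start from a bare self.get_table() call, which is exactly what the diff does.
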
mdata/MSR.py
... | ... | @@ -19,11 +19,15 @@ import happybase |
19 | 19 | |
20 | 20 | |
21 | 21 | class DataMSR(DataDumperBase): |
22 | - def __init__(self, base_dir='/home/hadoop/data/MSR-IRC2014/', sub_dir='Dev/', data_file='DevSetImage.tsv'): | |
23 | - DataDumperBase.__init__(self) | |
24 | - self.base_dir = base_dir | |
25 | - self.sub_dir = sub_dir | |
26 | - self.data_file = self.base_dir + self.sub_dir + data_file | |
22 | + def __init__(self, base_dir='/home/hadoop/data/MSR-IRC2014/', category='Dev', | |
23 | + data_file='DevSetImage.tsv', tag_file='DevSetLabel.tsv'): | |
24 | + DataDumperBase.__init__(self, base_dir, category) | |
25 | + | |
26 | + self.data_file = self.base_dir + self.category + '/' + data_file | |
27 | + self.tag_file = self.base_dir + self.category + '/' + tag_file | |
28 | + self.map_file = self.base_dir + self.category + '/' + 'images_map.tsv' | |
29 | + | |
30 | + self.table_name = self.base_dir.split('/')[-2] + '-' + self.category | |
27 | 31 | |
28 | 32 | def format(self): |
29 | 33 | self.extract() |
... | ... | @@ -44,105 +48,105 @@ class DataMSR(DataDumperBase): |
44 | 48 | with open('res/tmp.jpg', 'rb') as f: |
45 | 49 | index = md5(f.read()).hexdigest() |
46 | 50 | |
47 | - dir = self.base_dir + self.sub_dir + 'Img/' + index[:3] | |
51 | + dir = self.img_dir + index[:3] + '/' | |
48 | 52 | if not os.path.exists(dir): |
49 | 53 | os.makedirs(dir) |
50 | - path = dir + '/' + index[3:] + '.jpg' | |
51 | - print path | |
54 | + image = dir + index[3:] + '.jpg' | |
55 | + print image | |
52 | 56 | |
53 | - if not os.path.exists(path): | |
54 | - shutil.copy('res/tmp.jpg', path) | |
57 | + if not os.path.exists(image): | |
58 | + shutil.copy('res/tmp.jpg', image) | |
55 | 59 | # or : |
56 | - # img.save(path, format='JPEG') | |
60 | + # img.save(image, format='JPEG') | |
57 | 61 | |
58 | 62 | |
59 | 63 | def extract(self): |
60 | - for name, data in self.load_base64(): | |
61 | - self.hash_dump(data) | |
64 | + for name, data in self._load_base64(): | |
65 | + self._hash_dump(data) | |
62 | 66 | |
63 | 67 | |
64 | 68 | def build_list(self): |
65 | - dir = self.base_dir + self.sub_dir | |
66 | - lst = dir + 'Image.lst' | |
67 | - with open(lst, 'wb') as f: | |
68 | - for path, subdirs, files in os.walk(dir): | |
69 | + assert self.list_file != None | |
70 | + with open(self.list_file, 'wb') as f: | |
71 | + for path, subdirs, files in os.walk(self.img_dir): | |
69 | 72 | for name in files: |
70 | 73 | entry = path.split('/')[-1] + '/' + name |
71 | 74 | print entry |
72 | 75 | f.write(entry + '\n') |
73 | 76 | |
74 | 77 | |
75 | - def get_table(self, tablename, connection=None): | |
76 | - if connection is not None: | |
77 | - c = connection | |
78 | - else: | |
78 | + def get_table(self): | |
79 | + if self.table != None: | |
80 | + return self.table | |
81 | + | |
82 | + if self.connection is None: | |
79 | 83 | c = happybase.Connection('HPC-server') |
80 | - tables = c.tables() | |
81 | - if tablename not in tables: | |
84 | + self.connection = c | |
85 | + | |
86 | + tables = self.connection.tables() | |
87 | + if self.table_name not in tables: | |
82 | 88 | families = {'cf_pic': dict(), |
83 | 89 | 'cf_info': dict(max_versions=10), |
84 | 90 | 'cf_tag': dict(), |
91 | + 'cf_feat': dict(), | |
85 | 92 | } |
86 | - c.create_table(name=tablename, families=families) | |
93 | + self.connection.create_table(name=self.table_name, families=families) | |
94 | + | |
95 | + table = self.connection.table(name=self.table_name) | |
87 | 96 | |
88 | - tb = c.table(name=tablename) | |
89 | - return tb | |
97 | + self.table = table | |
90 | 98 | |
99 | + return table | |
100 | + | |
101 | + | |
102 | + def store_image(self): | |
103 | + if self.table == None: | |
104 | + self.table = self.get_table() | |
91 | 105 | |
92 | - def store_image(self, table): | |
93 | - timer.mark() | |
94 | - dir = self.base_dir + self.sub_dir + 'Img2/' | |
95 | - lst = dir + 'Image.lst' | |
96 | 106 | dict_buffer = {} |
97 | - with open(lst, 'rb') as f: | |
107 | + with open(self.list_file, 'rb') as f: | |
98 | 108 | for line in f: |
99 | 109 | path_img = line.strip('\n') |
100 | 110 | if path_img: |
101 | - with open(dir + 'Dev/' + path_img, 'rb') as fpic: | |
111 | + with open(self.img_dir + path_img, 'rb') as fpic: | |
102 | 112 | dict_buffer[path_img.replace('/', '')] = fpic.read() |
103 | - timer.report() # 1.507566s | |
104 | - timer.mark() | |
113 | + | |
105 | 114 | try: |
106 | - with table.batch(batch_size=5000) as b: | |
115 | + with self.table.batch(batch_size=5000) as b: | |
107 | 116 | for imgname, imgdata in dict_buffer.items(): |
108 | 117 | b.put(imgname, {'cf_pic:data': imgdata}) |
109 | 118 | raise ValueError("Something went wrong!") |
110 | 119 | except ValueError: |
111 | 120 | pass |
112 | - timer.report() # 228.003684s | |
113 | 121 | |
114 | 122 | |
115 | - def store_tag(self, table): | |
116 | - timer.mark() | |
117 | - dir = self.base_dir + self.sub_dir + 'Img2/' | |
118 | - maplst = dir + 'Image.tsv' | |
119 | - taglist = self.base_dir + self.sub_dir + 'Dev/DevSetLabel.tsv' | |
123 | + def store_tag(self): | |
124 | + if self.table == None: | |
125 | + self.table = self.get_table() | |
126 | + | |
120 | 127 | dict_namebuf = {} |
121 | 128 | dict_tagbuf = {} |
122 | 129 | |
123 | - with open(maplst, 'rb') as tsvfile: | |
130 | + with open(self.map_file, 'rb') as tsvfile: | |
124 | 131 | tsvfile = csv.reader(tsvfile, delimiter='\t') |
125 | 132 | for line in tsvfile: |
126 | 133 | dict_namebuf[line[0]] = line[2] |
127 | 134 | |
128 | - with open(taglist, 'rb') as tsvfile: | |
135 | + with open(self.tag_file, 'rb') as tsvfile: | |
129 | 136 | tsvfile = csv.reader(tsvfile, delimiter='\t') |
130 | 137 | for line in tsvfile: |
131 | 138 | dict_tagbuf[line[-2]] = (line[:-2], line[-1]) |
132 | 139 | |
133 | - timer.report() # 0.148540s | |
134 | - timer.mark() | |
135 | 140 | try: |
136 | - with table.batch(batch_size=5000) as b: | |
141 | + with self.table.batch(batch_size=5000) as b: | |
137 | 142 | for key, value in dict_tagbuf.items(): |
138 | 143 | b.put(dict_namebuf[key] + '.jpg', {'cf_tag:' + ''.join(value[0]): value[1]}) |
139 | 144 | raise ValueError("Something went wrong!") |
140 | 145 | except ValueError: |
141 | 146 | pass |
142 | - timer.report() # 3.280105s | |
143 | 147 | |
144 | - def get_feat(self, category): | |
148 | + def get_feat(self, feattype): | |
145 | 149 | pass |
146 | 150 | |
147 | - def store_feat(self, table, category): | |
151 | + def store_feat(self, feattype): | |
148 | 152 | pass |
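
All of the store_* methods in this commit share the same batched-write shape: collect a mapping into a dict, then push it into HBase through table.batch(batch_size=5000), wrapped in a try/except around a deliberately raised ValueError. A stripped-down sketch of just the core pattern is below; the helper name put_rows and its default column are mine, not the commit's. The batch context manager flushes any remaining mutations when it exits.

    def put_rows(table, rows, column='cf_pic:data', batch_size=5000):
        """Write {row_key: value} pairs into `column` of a happybase table,
        sending mutations to the server in chunks of `batch_size`."""
        with table.batch(batch_size=batch_size) as b:
            for row_key, value in rows.items():
                b.put(row_key, {column: value})

For example, DataCV.store_image boils down to put_rows(self.get_table(), dict_databuf, column='cf_pic:data'), and store_feat follows the same shape with a 'cf_feat:' column.
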
No preview for this file type
mdata/__init__.py
1 | +# -*- coding: utf-8 -*- | |
1 | 2 | __author__ = 'chunk' |
2 | 3 | |
3 | -__all__ = ['DataDumperBase', ] | |
4 | +__all__ = ['DataDumperBase'] | |
4 | 5 | |
5 | 6 | |
6 | 7 | class DataDumperBase(object): |
... | ... | @@ -8,17 +9,33 @@ class DataDumperBase(object): |
8 | 9 | Base class for image data dumping & retrieving. |
9 | 10 | A regular directory pattern would be like this: |
10 | 11 | |
11 | - ├── file-tag-list.tsv | |
12 | - │ | |
13 | - ├── Feat | |
14 | - │ ├── 0a1 | |
15 | - │ └── 53e | |
16 | - │ └── ... | |
17 | - | | |
18 | - └── Img | |
19 | - ├── 0a1 | |
20 | - └── 53e | |
21 | - └── ... | |
12 | + ├── Dev (category) | |
13 | + ├── file-tag.tsv (list_file) | |
14 | + │ | |
15 | + ├── Feat (feat_dir) | |
16 | + │ ├── 0a1 | |
17 | + │ └── 53e | |
18 | + │ └── ... | |
19 | + | | |
20 | + └── Img (img_dir) | |
21 | + ├── 0a1 | |
22 | + └── 53e | |
23 | + └── ... | |
24 | + ├── Train (category) | |
25 | + ├── file-tag.tsv (list_file) | |
26 | + │ | |
27 | + ├── Feat | |
28 | + │ ├── 032 | |
29 | + │ └── a21 | |
30 | + │ └── ... | |
31 | + | | |
32 | + └── Img | |
33 | + ├── 032 | |
34 | + └── a21 | |
35 | + └── ... | |
36 | + . | |
37 | + . | |
38 | + . | |
22 | 39 | |
23 | 40 | It can be refractored from the original pattern which is supposed to be generated from web crawlers: |
24 | 41 | |
... | ... | @@ -32,38 +49,47 @@ class DataDumperBase(object): |
32 | 49 | └── ddd.jpg |
33 | 50 | └── ... |
34 | 51 | |
52 | + | |
35 | 53 | convention: |
36 | 54 | 'img' for image file data while 'image' for file path; |
37 | 55 | |
38 | 56 | """ |
39 | 57 | |
40 | - def __init__(self): | |
41 | - self.base_dir = None | |
42 | - self.list_file = None | |
43 | - self.dict_data = None | |
58 | + def __init__(self, base_dir, category): | |
59 | + """ | |
60 | + base_dir: e.g. '/home/hadoop/data/MSR-IRC2014/' | |
61 | + list_file: not data_file! e.g. 'file-tag.tsv' | |
62 | + dict_data: e.g. {'filename':rawdata} or {'filename':tag} | |
63 | + """ | |
64 | + self.base_dir = base_dir | |
65 | + self.category = category | |
66 | + self.dst_dir = self.base_dir + 'dst/' + self.category + '/' | |
67 | + | |
68 | + self.list_file = self.dst_dir + 'file-tag.tsv' | |
69 | + self.feat_dir = self.dst_dir + 'Feat/' | |
70 | + self.img_dir = self.dst_dir + 'Img/' | |
44 | 71 | |
45 | - # self.table_name = None | |
46 | - # self.table = None | |
47 | - # self.connection = None | |
72 | + self.table_name = None | |
73 | + self.table = None | |
74 | + self.connection = None | |
48 | 75 | |
49 | 76 | def format(self): |
50 | 77 | pass |
51 | 78 | |
52 | 79 | |
53 | - | |
54 | - def get_table(self, tablename, connection=None): | |
80 | + def get_table(self): | |
55 | 81 | pass |
56 | 82 | |
57 | - def store_img(self, table): | |
83 | + def store_img(self): | |
58 | 84 | pass |
59 | 85 | |
60 | - def store_tag(self, table, category): | |
86 | + def store_tag(self, feattype): | |
61 | 87 | pass |
62 | 88 | |
63 | - def get_feat(self, category): | |
89 | + def get_feat(self, feattype): | |
64 | 90 | pass |
65 | 91 | |
66 | - def store_feat(self, table, category): | |
92 | + def store_feat(self, feattype): | |
67 | 93 | pass |
68 | 94 | |
69 | 95 | |
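
The docstring and the new DataDumperBase constructor together fix where everything lives: each dumper works under base_dir + 'dst/' + category + '/', with file-tag.tsv, Feat/ and Img/ inside it. A quick illustration of the values the constructor computes, assuming the package is importable (the base_dir below is the MSR default from the diff):

    from mdata import DataDumperBase

    d = DataDumperBase(base_dir='/home/hadoop/data/MSR-IRC2014/', category='Dev')
    print d.dst_dir    # /home/hadoop/data/MSR-IRC2014/dst/Dev/
    print d.list_file  # /home/hadoop/data/MSR-IRC2014/dst/Dev/file-tag.tsv
    print d.img_dir    # /home/hadoop/data/MSR-IRC2014/dst/Dev/Img/
    print d.feat_dir   # /home/hadoop/data/MSR-IRC2014/dst/Dev/Feat/

Subclasses then only add what differs: DataCV derives data_dir from the category, while DataMSR adds data_file, tag_file, map_file and a table name built from base_dir and category.
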
No preview for this file type
mfeat/__init__.py
No preview for this file type
res/tmp.jpg
... | ... | @@ -0,0 +1,10 @@ |
1 | +__author__ = 'chunk' | |
2 | + | |
3 | + | |
4 | +from mdata import MSR | |
5 | + | |
6 | +msrd = MSR.DataMSR(base_dir='/media/chunk/Elements/D/data/MSR-IRC2014/',category='Train',data_file='TrainImageSet.tsv', tag_file='TrainSetLabel.tsv') | |
7 | +# msrd.format() | |
8 | +# msrd.build_list() | |
9 | + | |
10 | +print 'helllo' | |
0 | 11 | \ No newline at end of file |
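
The new script above only constructs a DataMSR instance; the rest of the pipeline implied by this commit would look roughly like the sketch below. The method names come from the diff, but the ordering is my reading of it, and store_image/store_tag additionally need the 'HPC-server' HBase Thrift service to be reachable.

    from mdata import MSR

    msrd = MSR.DataMSR(base_dir='/media/chunk/Elements/D/data/MSR-IRC2014/',
                       category='Train',
                       data_file='TrainImageSet.tsv',
                       tag_file='TrainSetLabel.tsv')
    msrd.format()       # decode the base64 image dump and spread files under Img/<hash prefix>/
    msrd.build_list()   # write the image list to list_file
    msrd.store_image()  # push raw image bytes into the cf_pic column family
    msrd.store_tag()    # push labels into the cf_tag column family
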