staged.

Chunk
1 parent c7fa1d60
Showing 10 changed files with 166 additions and 143 deletions Show diff stats
common.pyc
mdata/CV.py
mdata/MSR.py
mdata/MSR.pyc
mdata/__init__.py
mdata/__init__.pyc
mfeat/__init__.py
mfeat/__init__.pyc
res/tmp.jpg
test_data.py
 __author__ = 'chunk'
 from mdata import *
-from mfeat import *
+from mfeat import HOG
 import os, sys
 from PIL import Image
@@ -15,11 +15,9 @@ import happybase
 class DataCV(DataDumperBase):
-    def __init__(self, base_dir='/home/hadoop/data/HeadShoulder/', sub_dir='Train/'):
-        DataDumperBase.__init__(self)
-        self.base_dir = base_dir
-        self.sub_dir = sub_dir
-
+    def __init__(self, base_dir='/home/hadoop/data/HeadShoulder/', category='Train'):
+        DataDumperBase.__init__(self, base_dir, category)
+        self.data_dir = self.base_dir + self.category + '/'
         self.dict_data = {}
     def format(self):
@@ -38,20 +36,20 @@ class DataCV(DataDumperBase):
         # original:
         # dir = base_dir + 'Img/Train/' + index[:3]
-        dir = self.base_dir + self.sub_dir + 'Img/' + index[:3]
+        dir = self.img_dir + index[:3] + '/'
         if not os.path.exists(dir):
             os.makedirs(dir)
-        path = dir + '/' + index[3:] + '.jpg'
-        print path
+        image = dir + index[3:] + '.jpg'
+        print image
-        if not os.path.exists(path):
-            shutil.copy(image, path)
+        if not os.path.exists(image):
+            shutil.copy(image, image)
         else:
             pass
     def extract(self):
-        for path, subdirs, files in os.walk(self.base_dir + 'Orig/'):
+        for path, subdirs, files in os.walk(self.data_dir):
             for name in files:
                 imagepath = os.path.join(path, name)
                 print imagepath
@@ -62,136 +60,121 @@ class DataCV(DataDumperBase):
         ordict_img = collections.OrderedDict(sorted(self.dict_data.items(), key=lambda d: d[0]))
-        lstfile = self.base_dir + self.sub_dir + 'Img/Image.tsv'
-        with open(lstfile, 'w') as f:
+        with open(self.list_file, 'w') as f:
             tsvfile = csv.writer(f, delimiter='\t')
             for key, value in ordict_img.items():
                 tsvfile.writerow([key] + [value])
+    def get_table(self):
+        if self.table != None:
+            return self.table
-    def get_table(self, tablename, connection=None):
-        if connection is not None:
-            c = connection
-        else:
+        if self.connection is None:
             c = happybase.Connection('HPC-server')
-        tables = c.tables()
-        if tablename not in tables:
+            self.connection = c
+
+        tables = self.connection.tables()
+        if self.table_name not in tables:
             families = {'cf_pic': dict(),
                         'cf_info': dict(max_versions=10),
                         'cf_tag': dict(),
                         'cf_feat': dict(),
             }
-            c.create_table(name=tablename, families=families)
+            self.connection.create_table(name=self.table_name, families=families)
+
+        table = self.connection.table(name=self.table_name)
+
+        self.table = table
-        tb = c.table(name=tablename)
-        return tb
+        return table
-    def store_image(self, table):
-        timer.mark()
-        dir = self.base_dir + self.sub_dir + 'Img/'
-        maplst = dir + 'Image.tsv'
+    def store_image(self):
+        if self.table == None:
+            self.table = self.get_table()
         dict_databuf = {}
-        with open(maplst, 'rb') as tsvfile:
+        with open(self.list_file, 'rb') as tsvfile:
             tsvfile = csv.reader(tsvfile, delimiter='\t')
             for line in tsvfile:
-                path_img = self.base_dir + self.sub_dir + 'Img/Train/' + line[0][:3] + '/' + line[0][3:] + '.jpg'
+                path_img = self.img_dir + + line[0][:3] + '/' + line[0][3:] + '.jpg'
                 if path_img:
                     with open(path_img, 'rb') as fpic:
                         dict_databuf[line[0] + '.jpg'] = fpic.read()
-        timer.report()  # 58.761801s
-        timer.mark()
         try:
-            with table.batch(batch_size=5000) as b:
+            with self.table.batch(batch_size=5000) as b:
                 for imgname, imgdata in dict_databuf.items():
                     b.put(imgname, {'cf_pic:data': imgdata})
                 raise ValueError("Something went wrong!")
         except ValueError:
             pass
-        timer.report()  # 15.570524s
-    def store_tag(self, table):
-        timer.mark()
-        dir = self.base_dir + self.sub_dir + 'Img/'
-        maplst = dir + 'Image.tsv'
+    def store_tag(self, feattype='hog'):
+        if self.table == None:
+            self.table = self.get_table()
         dict_tagbuf = {}
-        with open(maplst, 'rb') as tsvfile:
+        with open(self.list_file, 'rb') as tsvfile:
             tsvfile = csv.reader(tsvfile, delimiter='\t')
             for line in tsvfile:
                 dict_tagbuf[line[0] + '.jpg'] = line[1]
-        timer.report()  # 0.009741s
-        timer.mark()
         try:
-            with table.batch(batch_size=5000) as b:
+            with self.table.batch(batch_size=5000) as b:
                 for imgname, imgtag in dict_tagbuf.items():
-                    b.put(imgname, {'cf_tag:class': imgtag})
+                    b.put(imgname, {'cf_tag:' + feattype: imgtag})
                 raise ValueError("Something went wrong!")
         except ValueError:
             pass
-        timer.report()  # 0.509696s
-    def get_feat(self, category='hog'):
-        dir = self.base_dir + self.sub_dir + 'Img/'
-        maplst = dir + 'images_map_Train.tsv'
+    def get_feat(self, feattype='hog'):
         dict_tagbuf = {}
-
-        with open(maplst, 'rb') as tsvfile:
+        with open(self.list_file, 'rb') as tsvfile:
             tsvfile = csv.reader(tsvfile, delimiter='\t')
             for line in tsvfile:
                 dict_tagbuf[line[0] + '.jpg'] = line[1]
         dict_featbuf = {}
-
-        timer.mark()
         for imgname, imgtag in dict_tagbuf.items():
             # if imgtag == 'True':
-            path_img = self.base_dir + self.sub_dir + 'Img/Train/' + imgname[:3] + '/' + imgname[3:]
-            desc = FeatHOG.feat(path_img, size=(48, 48))
+            image = self.img_dir + imgname[:3] + '/' + imgname[3:]
+            desc = HOG.FeatHOG.feat(image, size=(48, 48))
             dict_featbuf[imgname] = desc
-        timer.report()  # 4.337425s
-        timer.mark()
         for imgname, desc in dict_featbuf.items():
             # print imgname, desc
-            dir = self.base_dir + self.sub_dir + 'Feat/Train/' + imgname[:3] + '/'
+            dir = self.feat_dir + imgname[:3] + '/'
             if not os.path.exists(dir):
                 os.makedirs(dir)
-            featpath = dir + imgname[3:].split('.')[0] + '.' + category
+            featpath = dir + imgname[3:].split('.')[0] + '.' + feattype
             with open(featpath, 'wb') as featfile:
                 featfile.write(json.dumps(desc.tolist()))
-        timer.report()  # 14.862485s
+    def store_feat(self, feattype='hog'):
+        if self.table == None:
+            self.table = self.get_table()
-    def store_feat(self, table):
-        timer.mark()
-        dir = self.base_dir + self.sub_dir + 'Feat/'
         dict_featbuf = {}
-        for path, subdirs, files in os.walk(dir + 'Train/'):
+        for path, subdirs, files in os.walk(self.feat_dir):
             for name in files:
                 featpath = os.path.join(path, name)
                 # print featpath
                 with open(featpath, 'rb') as featfile:
-                    imgname = path.split('/')[-1] + name.replace('.hog', '.jpg')
+                    imgname = path.split('/')[-1] + name.replace('.' + feattype, '.jpg')
                     dict_featbuf[imgname] = featfile.read()
-        timer.report()  # 0.577940s
-
-        timer.mark()
         try:
-            with table.batch(batch_size=5000) as b:
+            with self.table.batch(batch_size=5000) as b:
                 for imgname, featdesc in dict_featbuf.items():
-                    b.put(imgname, {'cf_feat:hog': featdesc})
+                    b.put(imgname, {'cf_feat:' + feattype: featdesc})
                 raise ValueError("Something went wrong!")
         except ValueError:
             pass
-        timer.report()  # 76.075477s
 \ No newline at end of file
+
@@ -19,11 +19,15 @@ import happybase
 class DataMSR(DataDumperBase):
-    def __init__(self, base_dir='/home/hadoop/data/MSR-IRC2014/', sub_dir='Dev/', data_file='DevSetImage.tsv'):
-        DataDumperBase.__init__(self)
-        self.base_dir = base_dir
-        self.sub_dir = sub_dir
-        self.data_file = self.base_dir + self.sub_dir + data_file
+    def __init__(self, base_dir='/home/hadoop/data/MSR-IRC2014/', category='Dev',
+                 data_file='DevSetImage.tsv', tag_file='DevSetLabel.tsv'):
+        DataDumperBase.__init__(self, base_dir, category)
+
+        self.data_file = self.base_dir + self.category + '/' + data_file
+        self.tag_file = self.base_dir + self.category + '/' + tag_file
+        self.map_file = self.base_dir + self.category + '/' + 'images_map.tsv'
+
+        self.table_name = self.base_dir.split('/')[-2] + '-' + self.category
     def format(self):
         self.extract()
@@ -44,105 +48,105 @@ class DataMSR(DataDumperBase):
         with open('res/tmp.jpg', 'rb') as f:
             index = md5(f.read()).hexdigest()
-        dir = self.base_dir + self.sub_dir + 'Img/' + index[:3]
+        dir = self.img_dir + index[:3] + '/'
         if not os.path.exists(dir):
             os.makedirs(dir)
-        path = dir + '/' + index[3:] + '.jpg'
-        print path
+        image = dir + index[3:] + '.jpg'
+        print image
-        if not os.path.exists(path):
-            shutil.copy('res/tmp.jpg', path)
+        if not os.path.exists(image):
+            shutil.copy('res/tmp.jpg', image)
             # or :
-            # img.save(path, format='JPEG')
+            # img.save(image, format='JPEG')
     def extract(self):
-        for name, data in self.load_base64():
-            self.hash_dump(data)
+        for name, data in self._load_base64():
+            self._hash_dump(data)
     def build_list(self):
-        dir = self.base_dir + self.sub_dir
-        lst = dir + 'Image.lst'
-        with open(lst, 'wb') as f:
-            for path, subdirs, files in os.walk(dir):
+        assert self.list_file != None
+        with open(self.list_file, 'wb') as f:
+            for path, subdirs, files in os.walk(self.img_dir):
                 for name in files:
                     entry = path.split('/')[-1] + '/' + name
                     print entry
                     f.write(entry + '\n')
-    def get_table(self, tablename, connection=None):
-        if connection is not None:
-            c = connection
-        else:
+    def get_table(self):
+        if self.table != None:
+            return self.table
+
+        if self.connection is None:
             c = happybase.Connection('HPC-server')
-        tables = c.tables()
-        if tablename not in tables:
+            self.connection = c
+
+        tables = self.connection.tables()
+        if self.table_name not in tables:
             families = {'cf_pic': dict(),
                         'cf_info': dict(max_versions=10),
                         'cf_tag': dict(),
+                        'cf_feat': dict(),
             }
-            c.create_table(name=tablename, families=families)
+            self.connection.create_table(name=self.table_name, families=families)
+
+        table = self.connection.table(name=self.table_name)
-        tb = c.table(name=tablename)
-        return tb
+        self.table = table
+        return table
+
+
+    def store_image(self):
+        if self.table == None:
+            self.table = self.get_table()
-    def store_image(self, table):
-        timer.mark()
-        dir = self.base_dir + self.sub_dir + 'Img2/'
-        lst = dir + 'Image.lst'
         dict_buffer = {}
-        with open(lst, 'rb') as f:
+        with open(self.list_file, 'rb') as f:
             for line in f:
                 path_img = line.strip('\n')
                 if path_img:
-                    with open(dir + 'Dev/' + path_img, 'rb') as fpic:
+                    with open(self.img_dir + path_img, 'rb') as fpic:
                         dict_buffer[path_img.replace('/', '')] = fpic.read()
-        timer.report()  # 1.507566s
-        timer.mark()
+
         try:
-            with table.batch(batch_size=5000) as b:
+            with self.table.batch(batch_size=5000) as b:
                 for imgname, imgdata in dict_buffer.items():
                     b.put(imgname, {'cf_pic:data': imgdata})
                 raise ValueError("Something went wrong!")
         except ValueError:
             pass
-        timer.report()  # 228.003684s
-    def store_tag(self, table):
-        timer.mark()
-        dir = self.base_dir + self.sub_dir + 'Img2/'
-        maplst = dir + 'Image.tsv'
-        taglist = self.base_dir + self.sub_dir + 'Dev/DevSetLabel.tsv'
+    def store_tag(self):
+        if self.table == None:
+            self.table = self.get_table()
+
         dict_namebuf = {}
         dict_tagbuf = {}
-        with open(maplst, 'rb') as tsvfile:
+        with open(self.map_file, 'rb') as tsvfile:
             tsvfile = csv.reader(tsvfile, delimiter='\t')
             for line in tsvfile:
                 dict_namebuf[line[0]] = line[2]
-        with open(taglist, 'rb') as tsvfile:
+        with open(self.tag_file, 'rb') as tsvfile:
             tsvfile = csv.reader(tsvfile, delimiter='\t')
             for line in tsvfile:
                 dict_tagbuf[line[-2]] = (line[:-2], line[-1])
-        timer.report()  # 0.148540s
-        timer.mark()
         try:
-            with table.batch(batch_size=5000) as b:
+            with self.table.batch(batch_size=5000) as b:
                 for key, value in dict_tagbuf.items():
                     b.put(dict_namebuf[key] + '.jpg', {'cf_tag:' + ''.join(value[0]): value[1]})
                 raise ValueError("Something went wrong!")
         except ValueError:
             pass
-        timer.report()  # 3.280105s
-    def get_feat(self, category):
+    def get_feat(self, feattype):
         pass
-    def store_feat(self, table, category):
+    def store_feat(self, feattype):
         pass
+# -*- coding: utf-8 -*-
 __author__ = 'chunk'
-__all__ = ['DataDumperBase', ]
+__all__ = ['DataDumperBase']
 class DataDumperBase(object):
@@ -8,17 +9,33 @@ class DataDumperBase(object):
     Base class for image data dumping & retrieving.
     A regular directory pattern would be like this:
-        ├── file-tag-list.tsv
-        │
-        ├── Feat
-        │   ├── 0a1
-        │   └── 53e
-        │   └── ...
-        |
-        └── Img
-            ├── 0a1
-            └── 53e
-            └── ...
+        ├── Dev (category)
+            ├── file-tag.tsv (list_file)
+            │
+            ├── Feat (feat_dir)
+            │   ├── 0a1
+            │   └── 53e
+            │   └── ...
+            |
+            └── Img (img_dir)
+                ├── 0a1
+                └── 53e
+                └── ...
+        ├── Train (category)
+            ├── file-tag.tsv (list_file)
+            │
+            ├── Feat
+            │   ├── 032
+            │   └── a21
+            │   └── ...
+            |
+            └── Img
+                ├── 032
+                └── a21
+                └── ...
+        .
+        .
+        .
     It can be refactored from the original pattern, which is supposed to be generated by web crawlers:
@@ -32,38 +49,47 @@ class DataDumperBase(object):
             └── ddd.jpg
             └── ...
+
     convention:
         'img' refers to image file data, while 'image' refers to a file path;
     """
-    def __init__(self):
-        self.base_dir = None
-        self.list_file = None
-        self.dict_data = None
+    def __init__(self, base_dir, category):
+        """
+        base_dir: e.g. '/home/hadoop/data/MSR-IRC2014/'
+        list_file: not data_file! e.g. 'file-tag.tsv'
+        dict_data: e.g. {'filename':rawdata} or {'filename':tag}
+        """
+        self.base_dir = base_dir
+        self.category = category
+        self.dst_dir = self.base_dir + 'dst/' + self.category + '/'
+
+        self.list_file = self.dst_dir + 'file-tag.tsv'
+        self.feat_dir = self.dst_dir + 'Feat/'
+        self.img_dir = self.dst_dir + 'Img/'
-        # self.table_name = None
-        # self.table = None
-        # self.connection = None
+        self.table_name = None
+        self.table = None
+        self.connection = None
     def format(self):
         pass
-
-    def get_table(self, tablename, connection=None):
+    def get_table(self):
         pass
-    def store_img(self, table):
+    def store_img(self):
         pass
-    def store_tag(self, table, category):
+    def store_tag(self, feattype):
         pass
-    def get_feat(self, category):
+    def get_feat(self, feattype):
         pass
-    def store_feat(self, table, category):
+    def store_feat(self, feattype):
         pass
@@ -7,7 +7,7 @@ import cv2
 from skimage.feature import hog
 from skimage import io, color, transform, exposure
-__all__ = ['FeatureBase', 'FeatHOG', 'timer']
+__all__ = ['FeatureBase']
 timer = ctimer()
@@ -0,0 +1,10 @@
+__author__ = 'chunk'
+
+
+from mdata import MSR
+
+msrd = MSR.DataMSR(base_dir='/media/chunk/Elements/D/data/MSR-IRC2014/',category='Train',data_file='TrainImageSet.tsv', tag_file='TrainSetLabel.tsv')
+# msrd.format()
+# msrd.build_list()
+
+print 'helllo'
 \ No newline at end of file
	@@ -7,7 +7,7 @@ import cv2		@@ -7,7 +7,7 @@ import cv2
7	from skimage.feature import hog	7	from skimage.feature import hog
8	from skimage import io, color, transform, exposure	8	from skimage import io, color, transform, exposure
9		9
10	-__all__ = ['FeatureBase', 'FeatHOG', 'timer']	10	+__all__ = ['FeatureBase']
11		11
12	timer = ctimer()	12	timer = ctimer()
13		13
	@@ -0,0 +1,10 @@		@@ -0,0 +1,10 @@
		1	+__author__ = 'chunk'
		2	+
		3	+
		4	+from mdata import MSR
		5	+
		6	+msrd = MSR.DataMSR(base_dir='/media/chunk/Elements/D/data/MSR-IRC2014/',category='Train',data_file='TrainImageSet.tsv', tag_file='TrainSetLabel.tsv')
		7	+# msrd.format()
		8	+# msrd.build_list()
		9	+
		10	+print 'helllo'
0	\ No newline at end of file	11	\ No newline at end of file