# CV.py 5.66 KB
__author__ = 'chunk'

from mdata import *
from mfeat import HOG

import os, sys
from PIL import Image
from hashlib import md5
import csv
import shutil
from common import *
import json
import collections
import happybase


class DataCV(DataDumperBase):
    def __init__(self, base_dir='/home/hadoop/data/HeadShoulder/', category='Train'):
        """Set up the dumper for one category of the dataset.

        :param base_dir: dataset root.  It is concatenated directly with
            ``category``, so it is expected to end with ``/``.
        :param category: name of the sub-directory below ``base_dir`` whose
            files are walked by :meth:`extract`.
        """
        DataDumperBase.__init__(self, base_dir, category)
        # digest -> label mapping, filled in by _hash_copy().
        self.dict_data = dict()
        # base_dir / category are read back from the instance; presumably the
        # base initializer stores them there -- confirm in DataDumperBase.
        self.data_dir = ''.join((self.base_dir, self.category, '/'))

    def format(self):
        """Run the formatting step, which for this class is just :meth:`extract`."""
        self.extract()

    def _hash_copy(self, image, ispos):
        """Record one image under its MD5 digest and copy it into the image store.

        Files whose name does not end in ``jpg`` are first re-encoded as JPEG
        into ``res/tmp.jpg``; the digest is taken over the bytes of the file
        that actually gets stored.  The digest is recorded in
        ``self.dict_data`` with ``ispos`` as its value, and the file is copied
        to ``<img_dir>/<digest[:3]>/<digest[3:]>.jpg`` unless that file
        already exists.

        :param image: path of the source image.
        :param ispos: label remembered for this image in ``self.dict_data``.
        """
        source = image
        if not source.endswith('jpg'):
            # Everything in the store carries a .jpg name, so convert other
            # formats before hashing and copying.
            source = 'res/tmp.jpg'
            if not os.path.exists('res'):
                os.makedirs('res')
            Image.open(image).save(source, format='JPEG')

        with open(source, 'rb') as f:
            index = md5(f.read()).hexdigest()

        self.dict_data[index] = ispos

        # Shard the store by the first three hex digits of the digest.
        target_dir = self.img_dir + index[:3] + '/'
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        target = target_dir + index[3:] + '.jpg'
        print(target)

        # Source and target must stay distinct paths: the file that was hashed
        # is the one copied.  The target name is derived from the content, so
        # an existing target can be left untouched.
        if not os.path.exists(target):
            shutil.copy(source, target)


    def extract(self):
        """Walk ``self.data_dir``, store every file found and write the index list.

        A file is labelled positive (``True``) when the name of the directory
        that directly contains it starts with ``pos``; every other file is
        labelled ``False``.  After the walk, the accumulated
        ``self.dict_data`` is written to ``self.list_file`` as tab-separated
        ``digest<TAB>label`` rows, sorted by digest.
        """
        for path, subdirs, files in os.walk(self.data_dir):
            for name in files:
                imagepath = os.path.join(path, name)
                print(imagepath)
                # Name of the directory holding the file, independent of the
                # platform's path separator.
                parent = os.path.basename(os.path.dirname(imagepath))
                self._hash_copy(imagepath, parent.startswith('pos'))

        with open(self.list_file, 'w') as f:
            tsvfile = csv.writer(f, delimiter='\t')
            # Digests are unique dict keys, so sorting the keys gives the
            # same row order as sorting the (digest, label) pairs.
            for key in sorted(self.dict_data):
                tsvfile.writerow([key, self.dict_data[key]])

    def get_table(self):
        """Return the HBase table handle, creating connection and table on first use.

        The handle is cached in ``self.table``; later calls return the cached
        object.  If ``self.connection`` is ``None`` a connection to
        ``'HPC-server'`` is opened and cached as well.  When no table named
        ``self.table_name`` is reported by the connection, it is created with
        the column families ``cf_pic``, ``cf_info`` (10 versions kept),
        ``cf_tag`` and ``cf_feat``.

        ``self.table``, ``self.connection`` and ``self.table_name`` are not
        assigned in this class -- presumably DataDumperBase provides them.

        :return: the table object obtained from ``connection.table()``.
        """
        if self.table is not None:
            return self.table

        if self.connection is None:
            self.connection = happybase.Connection('HPC-server')

        if self.table_name not in self.connection.tables():
            families = {
                'cf_pic': dict(),
                'cf_info': dict(max_versions=10),
                'cf_tag': dict(),
                'cf_feat': dict(),
            }
            self.connection.create_table(name=self.table_name, families=families)

        self.table = self.connection.table(name=self.table_name)
        return self.table


    def store_image(self):
        """Upload the bytes of every listed image into column ``cf_pic:data``.

        Each row of ``self.list_file`` (tab-separated, digest in the first
        column) names one image stored at
        ``<img_dir>/<digest[:3]>/<digest[3:]>.jpg``.  The row key used in the
        table is ``<digest>.jpg``.  Listed images that are missing on disk are
        skipped.  Mutations go through a batch created with
        ``batch_size=5000`` and are streamed one image at a time, so the
        whole data set is never held in memory.

        Errors from reading the files or from the table are not swallowed;
        they propagate to the caller.
        """
        if self.table is None:
            self.table = self.get_table()

        with open(self.list_file, 'rb') as listfile:
            with self.table.batch(batch_size=5000) as batch:
                for row in csv.reader(listfile, delimiter='\t'):
                    if not row:
                        # Tolerate blank lines in the list file.
                        continue
                    index = row[0]
                    path_img = self.img_dir + index[:3] + '/' + index[3:] + '.jpg'
                    if not os.path.isfile(path_img):
                        # Listed but not present in the store: nothing to upload.
                        continue
                    with open(path_img, 'rb') as fpic:
                        batch.put(index + '.jpg', {'cf_pic:data': fpic.read()})


    def store_tag(self, feattype='hog'):
        """Upload the label of every listed image into column ``cf_tag:<feattype>``.

        Each row of ``self.list_file`` is read as tab-separated
        ``digest<TAB>label``; the label text is stored unchanged under row key
        ``<digest>.jpg``.  Mutations go through a batch created with
        ``batch_size=5000``.

        Errors from reading the list or from the table are not swallowed;
        they propagate to the caller.

        :param feattype: column qualifier appended to ``cf_tag:``.
        """
        if self.table is None:
            self.table = self.get_table()

        with open(self.list_file, 'rb') as listfile:
            with self.table.batch(batch_size=5000) as batch:
                for row in csv.reader(listfile, delimiter='\t'):
                    if not row:
                        # Tolerate blank lines in the list file.
                        continue
                    batch.put(row[0] + '.jpg', {'cf_tag:' + feattype: row[1]})


    def get_feat(self, feattype='hog'):
        """Compute a descriptor for every listed image and write it to ``feat_dir``.

        For each digest in the first column of ``self.list_file`` the image
        ``<img_dir>/<digest[:3]>/<digest[3:]>.jpg`` is passed to
        ``HOG.FeatHOG.feat`` with ``size=(48, 48)``.  The result is converted
        with ``tolist()`` and written as JSON to
        ``<feat_dir>/<digest[:3]>/<digest[3:]>.<feattype>``.  Each descriptor
        is written as soon as it is computed.

        Nothing is returned; the output is the set of files written.

        :param feattype: extension given to the feature files.  The descriptor
            itself always comes from ``HOG.FeatHOG.feat``.
        """
        with open(self.list_file, 'rb') as listfile:
            # Only the digest column is needed here; labels are ignored.
            imgnames = [row[0] + '.jpg'
                        for row in csv.reader(listfile, delimiter='\t') if row]

        for imgname in imgnames:
            image = self.img_dir + imgname[:3] + '/' + imgname[3:]
            desc = HOG.FeatHOG.feat(image, size=(48, 48))

            feat_subdir = self.feat_dir + imgname[:3] + '/'
            if not os.path.exists(feat_subdir):
                os.makedirs(feat_subdir)
            featpath = feat_subdir + imgname[3:].split('.')[0] + '.' + feattype
            with open(featpath, 'wb') as featfile:
                # desc is presumably a numpy array (it must offer tolist()).
                featfile.write(json.dumps(desc.tolist()))


    def store_feat(self, feattype='hog'):
        """Upload every ``.<feattype>`` file under ``feat_dir`` into ``cf_feat:<feattype>``.

        The row key is rebuilt from the file location: the name of the
        directory holding the file, followed by the file name with its
        ``.<feattype>`` suffix replaced by ``.jpg``.  Files with any other
        extension are skipped.  The file content is stored as read.
        Mutations go through a batch created with ``batch_size=5000`` and are
        streamed one file at a time.

        Errors from reading the files or from the table are not swallowed;
        they propagate to the caller.

        :param feattype: feature file extension and column qualifier.
        """
        if self.table is None:
            self.table = self.get_table()

        suffix = '.' + feattype
        with self.table.batch(batch_size=5000) as batch:
            for path, subdirs, files in os.walk(self.feat_dir):
                shard = os.path.basename(path)
                for name in files:
                    if not name.endswith(suffix):
                        # Not a feature file of the requested type.
                        continue
                    # Strip only the trailing suffix, then restore '.jpg'.
                    imgname = shard + name[:-len(suffix)] + '.jpg'
                    with open(os.path.join(path, name), 'rb') as featfile:
                        batch.put(imgname, {'cf_feat:' + feattype: featfile.read()})