Chunk / ImageR

Blame view

mdata/CV.py 6.19 KB
__author__ = 'chunk'

from mdata import *
from mfeat import *

import os, sys
from PIL import Image
from hashlib import md5
import csv
import shutil
from common import *
import json
import collections
import happybase


class DataCV(DataDumperBase):
    def __init__(self, base_dir='/home/hadoop/data/HeadShoulder/', sub_dir='Train/'):
        """Remember the dataset locations and start with an empty label map.

        base_dir -- dataset root. The methods of this class build paths by
                    plain string concatenation, so it should end with '/'.
        sub_dir  -- folder below base_dir that receives the generated files;
                    it is concatenated the same way and should end with '/'.
        """
        DataDumperBase.__init__(self)

        # md5 hex digest of an image -> label flag; filled in by _hash_copy().
        self.dict_data = {}

        self.sub_dir = sub_dir
        self.base_dir = base_dir

    def format(self):
        """Run extract(); this method adds nothing of its own."""
        self.extract()

    def _hash_copy(self, image, ispos):
        """Copy one image into the hash-addressed tree and record its label.

        The file is keyed by the md5 hex digest of its bytes. The first three
        hex characters name the sub-directory and the rest the file name:
        <base_dir><sub_dir>Img/<digest[:3]>/<digest[3:]>.jpg

        A source whose name does not end in 'jpg' is first re-encoded as JPEG
        into the scratch file 'res/tmp.jpg' (relative to the working
        directory); that re-encoded copy is what gets hashed and stored.

        image -- path of the source image file.
        ispos -- label saved in self.dict_data under the digest.
        """
        if not image.endswith('jpg'):
            scratch = 'res/tmp.jpg'
            # The scratch folder may not exist yet; saving would fail then.
            if not os.path.isdir('res'):
                os.makedirs('res')

            img = Image.open(image)
            # The JPEG writer only accepts a few pixel modes. Palette or
            # alpha images (typical for PNG/GIF) are converted to RGB so that
            # save() does not raise; already supported modes are untouched.
            if img.mode not in ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr'):
                img = img.convert('RGB')
            img.save(scratch, format='JPEG')
            image = scratch

        with open(image, 'rb') as f:
            index = md5(f.read()).hexdigest()

        self.dict_data[index] = ispos

        target_dir = self.base_dir + self.sub_dir + 'Img/' + index[:3]
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        path = target_dir + '/' + index[3:] + '.jpg'
        # Single-argument form prints the same text under the print statement
        # and the print function.
        print(path)

        # Identical bytes always map to the same path, so an existing file is
        # left as it is.
        if not os.path.exists(path):
            shutil.copy(image, path)


    def extract(self):
        """Import every file below <base_dir>Orig/ and write the label index.

        Each file is handed to _hash_copy(). It is labelled True when the
        folder that directly contains it has a name starting with 'pos',
        otherwise False. Afterwards all entries of self.dict_data are written,
        sorted by digest, to <base_dir><sub_dir>Img/Image.tsv as
        tab-separated "<digest>\\t<label>" rows.
        """
        for folder, _subdirs, filenames in os.walk(self.base_dir + 'Orig/'):
            for filename in filenames:
                imagepath = os.path.join(folder, filename)
                print(imagepath)
                # Second-to-last path component = name of the containing folder.
                parent = imagepath.split('/')[-2]
                self._hash_copy(imagepath, parent.startswith('pos'))

        index_path = self.base_dir + self.sub_dir + 'Img/Image.tsv'
        with open(index_path, 'w') as f:
            writer = csv.writer(f, delimiter='\t')
            for digest in sorted(self.dict_data):
                writer.writerow([digest, self.dict_data[digest]])


    def get_table(self, tablename, connection=None):
        """Return the table called `tablename`, creating it when it is missing.

        tablename  -- name of the table to look up.
        connection -- connection to use; when None a new
                      happybase.Connection('HPC-server') is opened.

        A newly created table gets the column families cf_pic, cf_info
        (max_versions=10), cf_tag and cf_feat.
        """
        conn = connection if connection is not None else happybase.Connection('HPC-server')

        if tablename not in conn.tables():
            families = {}
            families['cf_pic'] = dict()
            families['cf_info'] = dict(max_versions=10)
            families['cf_tag'] = dict()
            families['cf_feat'] = dict()
            conn.create_table(name=tablename, families=families)

        return conn.table(name=tablename)


    def store_image(self, table):
        """Upload the raw bytes of every indexed image to `table`.

        Reads the digests listed in <base_dir><sub_dir>Img/Image.tsv, loads
        each image from <base_dir><sub_dir>Img/<digest[:3]>/<digest[3:]>.jpg
        (the layout written by _hash_copy) and stores the bytes in column
        'cf_pic:data' under the row key '<digest>.jpg'.

        table -- object offering batch(batch_size=...) as a context manager
                 whose result has put(row, data); presumably the table
                 returned by get_table().

        A listed image that is missing on disk raises IOError. Errors raised
        while uploading propagate to the caller.
        """
        timer.mark()
        img_dir = self.base_dir + self.sub_dir + 'Img/'
        maplst = img_dir + 'Image.tsv'

        dict_databuf = {}

        with open(maplst, 'rb') as tsvfile:
            for line in csv.reader(tsvfile, delimiter='\t'):
                if not line:
                    # Blank row in the index; nothing to load.
                    continue
                digest = line[0]
                path_img = img_dir + digest[:3] + '/' + digest[3:] + '.jpg'
                with open(path_img, 'rb') as fpic:
                    dict_databuf[digest + '.jpg'] = fpic.read()

        timer.report()  # 58.761801s
        timer.mark()
        # Leaving the with-block sends whatever is still queued in the batch,
        # so no artificial exception is needed to trigger the upload.
        with table.batch(batch_size=5000) as b:
            for imgname, imgdata in dict_databuf.items():
                b.put(imgname, {'cf_pic:data': imgdata})
        timer.report()  # 15.570524s


    def store_tag(self, table):
        """Upload the label of every indexed image to `table`.

        Reads "<digest>\\t<label>" rows from <base_dir><sub_dir>Img/Image.tsv
        and stores each label string in column 'cf_tag:class' under the row
        key '<digest>.jpg'.

        table -- object offering batch(batch_size=...) as a context manager
                 whose result has put(row, data); presumably the table
                 returned by get_table().

        Errors raised while uploading propagate to the caller.
        """
        timer.mark()
        maplst = self.base_dir + self.sub_dir + 'Img/' + 'Image.tsv'

        dict_tagbuf = {}

        with open(maplst, 'rb') as tsvfile:
            for line in csv.reader(tsvfile, delimiter='\t'):
                if not line:
                    # Blank row in the index; nothing to store.
                    continue
                dict_tagbuf[line[0] + '.jpg'] = line[1]

        timer.report()  # 0.009741s
        timer.mark()
        # Leaving the with-block sends whatever is still queued in the batch,
        # so no artificial exception is needed to trigger the upload.
        with table.batch(batch_size=5000) as b:
            for imgname, imgtag in dict_tagbuf.items():
                b.put(imgname, {'cf_tag:class': imgtag})
        timer.report()  # 0.509696s


    def get_feat(self, category='hog'):
        """Compute a descriptor for every indexed image and save it as JSON.

        The digests come from <base_dir><sub_dir>Img/Image.tsv and the images
        from <base_dir><sub_dir>Img/<digest[:3]>/<digest[3:]>.jpg, i.e. the
        files produced by extract() and _hash_copy(). Each image is passed to
        FeatHOG.feat(path, size=(48, 48)); the result's tolist() is dumped as
        JSON to <base_dir><sub_dir>Feat/Train/<digest[:3]>/<digest[3:]>.<category>.

        category -- used only as the extension of the output files; the
                    descriptor itself always comes from FeatHOG.
                    NOTE(review): store_feat() maps only '.hog' names back to
                    image names, so other values need matching changes there.
        """
        img_dir = self.base_dir + self.sub_dir + 'Img/'
        maplst = img_dir + 'Image.tsv'

        # Row-key style names ('<digest>.jpg') of all indexed images.
        imgnames = {}
        with open(maplst, 'rb') as tsvfile:
            for line in csv.reader(tsvfile, delimiter='\t'):
                if not line:
                    # Blank row in the index; nothing to process.
                    continue
                imgnames[line[0] + '.jpg'] = True

        dict_featbuf = {}

        timer.mark()
        for imgname in imgnames:
            path_img = img_dir + imgname[:3] + '/' + imgname[3:]
            dict_featbuf[imgname] = FeatHOG.feat(path_img, size=(48, 48))
        timer.report()  # 4.337425s

        timer.mark()
        for imgname, desc in dict_featbuf.items():
            feat_dir = self.base_dir + self.sub_dir + 'Feat/Train/' + imgname[:3] + '/'
            if not os.path.exists(feat_dir):
                os.makedirs(feat_dir)
            # Drop the '.jpg' suffix and append the requested extension.
            featpath = feat_dir + imgname[3:].split('.')[0] + '.' + category
            with open(featpath, 'wb') as featfile:
                featfile.write(json.dumps(desc.tolist()))

        timer.report()  # 14.862485s


    def store_feat(self, table):
        """Upload every stored feature file to `table`.

        Walks <base_dir><sub_dir>Feat/Train/ and reads each file found there.
        The row key is the name of the containing folder followed by the file
        name with '.hog' replaced by '.jpg'; the file content is stored
        unchanged in column 'cf_feat:hog'.

        table -- object offering batch(batch_size=...) as a context manager
                 whose result has put(row, data); presumably the table
                 returned by get_table().

        Errors raised while uploading propagate to the caller.
        """
        timer.mark()
        feat_root = self.base_dir + self.sub_dir + 'Feat/'
        dict_featbuf = {}
        for path, subdirs, files in os.walk(feat_root + 'Train/'):
            for name in files:
                featpath = os.path.join(path, name)
                # Folder name (digest prefix) + file name rebuilds '<digest>.jpg'.
                imgname = path.split('/')[-1] + name.replace('.hog', '.jpg')
                with open(featpath, 'rb') as featfile:
                    dict_featbuf[imgname] = featfile.read()

        timer.report()  # 0.577940s

        timer.mark()
        # Leaving the with-block sends whatever is still queued in the batch,
        # so no artificial exception is needed to trigger the upload.
        with table.batch(batch_size=5000) as b:
            for imgname, featdesc in dict_featbuf.items():
                b.put(imgname, {'cf_feat:hog': featdesc})
        timer.report()  # 76.075477s