# -*- coding: utf-8 -*-
__author__ = 'chunk'

from ..common import *
from .dependencies import *
from . import *
# from ..mdata import MSR, CV, ILSVRC, ILSVRC_S

from ..mjpeg import *
from ..msteg import *
from ..msteg.steganography import LSB, F3, F4, F5
from ..mfeat import IntraBlockDiff

import os  # used below for package_dir and resource paths
import sys
from pyspark import SparkConf, SparkContext
from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD
from pyspark.mllib.regression import LabeledPoint
from numpy import array
import json
import pickle
import tempfile
import numpy as np
from scipy import stats
from hashlib import md5

np.random.seed(sum(map(ord, "whoami")))

package_dir = os.path.dirname(os.path.abspath(__file__))


def rddparse_data_CV(raw_row):
    """
    input:  (u'key0',u'cf_feat:hog:[0.056273,...]--%--cf_pic:data:\ufffd\ufffd\...--%--cf_tag:hog:True')
    return: ([0.056273,...], 1)
    """
    data = raw_row[1].split('--%--')
    feat = json.loads(data[0].split(':')[-1])
    tag = 1 if data[-1].split(':')[-1] == 'True' else 0
    return (feat, tag)


def rddparse_data_ILS(raw_row):
    """
    input:  (u'key0',u'cf_feat:hog:[0.056273,...]--%--cf_pic:data:\ufffd\ufffd\...--%--cf_tag:hog:True')
    return: (key, raw_jpeg_bytes)

    In fact we can also use mapValues.
    """
    key = raw_row[0]
    # if key == '04650c488a2b163ca8a1f52da6022f03.jpg':
    #     with open('/tmp/hhhh', 'wb') as f:
    #         f.write(raw_row[1].decode('unicode-escape').encode('latin-1'))
    # HBase hands the cell value back as an escaped unicode string; the
    # decode/encode round-trip restores the original raw bytes.
    items = raw_row[1].decode('unicode-escape').encode('latin-1').split('--%--')
    data = items[0].split('cf_pic:data:')[-1]
    return (key, data)


def rddparse_all_ILS(raw_row):
    key = raw_row[0]
    items = raw_row[1].decode('unicode-escape').encode('latin-1').split('--%--')
    data = [items[0].split('cf_pic:data:')[-1]] + [json.loads(item.split(':')[-1]) for item in items[1:]]
    return (key, data)


def rddinfo_ILS(img, info_rate=None, tag_chosen=None, tag_class=None):
    """
    Tempfile is our friend. (?)
    """
    info_rate = info_rate if info_rate != None else 0.0
    tag_chosen = tag_chosen if tag_chosen != None else stats.bernoulli.rvs(0.8)
    tag_class = tag_class if tag_class != None else 0
    try:
        tmpf = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
        tmpf.write(img)
        tmpf.seek(0)
        im = Jpeg(tmpf.name, key=sample_key)
        info = [
            im.image_width,
            im.image_height,
            im.image_width * im.image_height,
            im.getCapacity(),
            im.getQuality(),
            info_rate,
            tag_chosen,
            tag_class
        ]
        return info
    except Exception as e:
        print e
        raise
    finally:
        tmpf.close()
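
# NOTE (editorial sketch, an assumption inferred from the indices used below):
# the rows consumed by rddembed_ILS / rddfeat_ILS are taken to be
# (key, [raw_jpeg] + rddinfo_ILS(...)), i.e.
#   items[0] -> raw JPEG bytes
#   items[1] -> width        items[2] -> height       items[3] -> width * height
#   items[4] -> capacity     items[5] -> quality
#   items[6] -> embed rate   items[7] -> tag_chosen   items[8] -> tag_class
# which is why capacity/rate/chosen are read from indices 4, 6 and 7.
# Example value (made-up numbers): ('d41d8cd9....jpg', [raw, 512, 384, 196608, 30000, 75, 0.1, 1, 1])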


def rddembed_ILS(row):
    """
    input:  e.g. row = ('row1', [1, 3400, 'hello'])
    return: newrow = ('row2', [34, 5400, 'embedded']), or None if the row was
            not chosen for embedding (callers should filter the Nones out).
    """
    items = row[1]
    capacity, rate, chosen = int(items[4]), float(items[6]), int(items[7])
    if chosen == 0:
        return None
    try:
        tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
        tmpf_src.write(items[0])
        tmpf_src.seek(0)
        tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')

        steger = F5.F5(sample_key, 1)

        if rate == None:
            embed_rate = steger.embed_raw_data(tmpf_src.name,
                                               os.path.join(package_dir, '../res/toembed'),
                                               tmpf_dst.name)
        else:
            assert (rate >= 0 and rate < 1)
            # print capacity
            hidden = np.random.bytes(int(int(capacity) * rate) / 8)
            embed_rate = steger.embed_raw_data(tmpf_src.name, hidden, tmpf_dst.name, frommem=True)

        tmpf_dst.seek(0)
        raw = tmpf_dst.read()
        index = md5(raw).hexdigest()
        return (index + '.jpg', [raw] + rddinfo_ILS(raw, embed_rate, 0, 1))
    except Exception as e:
        print e
        raise
    finally:
        tmpf_src.close()
        tmpf_dst.close()


def _get_feat(image, feattype='ibd', **kwargs):
    if feattype == 'ibd':
        feater = IntraBlockDiff.FeatIntraBlockDiff()
    else:
        raise Exception("Unknown feature type!")

    desc = feater.feat(image)

    return desc


def rddfeat_ILS(row, feattype='ibd', **kwargs):
    items = row[1]
    capacity, rate, chosen = int(items[4]), float(items[6]), int(items[7])
    try:
        tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
        tmpf_src.write(items[0])
        tmpf_src.seek(0)

        desc = json.dumps(_get_feat(tmpf_src.name, feattype=feattype).tolist())
        # list.append() returns None, so append first and return the list itself.
        row[1].append(desc)
        return (row[0], row[1])
    except Exception as e:
        print e
        raise
    finally:
        tmpf_src.close()


def format_out(row, cols):
    """
    input:
        e.g. row = ('row1', [1, 3400, 'hello'])
             cols = [['cf_info', 'id'], ['cf_info', 'size'], ['cf_tag', 'desc']]
    return:
        [('row1', ['row1', 'cf_info', 'id', '1']),
         ('row1', ['row1', 'cf_info', 'size', '3400']),
         ('row1', ['row1', 'cf_tag', 'desc', 'hello'])]
    """
    puts = []
    key = row[0]
    for data, col in zip(row[1], cols):
        puts.append((key, [key] + col + [str(data)]))
    return puts


class Sparker(object):
    def __init__(self, host='HPC-server', appname='NewPySparkApp', **kwargs):
        load_env()
        self.host = host
        self.appname = appname
        self.master = kwargs.get('master', 'spark://%s:7077' % self.host)

        self.conf = SparkConf()
        self.conf.setSparkHome(self.host) \
            .setMaster(self.master) \
            .setAppName(self.appname)

        # self.conf.set("spark.akka.frameSize", "10685760")

        # self.conf.set("spark.driver.extraClassPath", extraClassPath) \
        #     .set("spark.executor.extraClassPath", extraClassPath) \
        #     .set("SPARK_CLASSPATH", extraClassPath) \
        #     .set("spark.driver.memory", "1G") \
        #     .set("spark.yarn.jar", sparkJar)

        self.sc = SparkContext(conf=self.conf)

        self.model = None

    def read_hbase(self, table_name, func=None, collect=False):
        """
        ref - http://happybase.readthedocs.org/en/latest/user.html#retrieving-data

        Filter format:
            columns=['cf1:col1', 'cf1:col2']
            or
            columns=['cf1']
        """
        hconf = {
            "hbase.zookeeper.quorum": self.host,
            "hbase.mapreduce.inputtable": table_name,
        }

        hbase_rdd = self.sc.newAPIHadoopRDD(inputFormatClass=hparams["inputFormatClass"],
                                            keyClass=hparams["readKeyClass"],
                                            valueClass=hparams["readValueClass"],
                                            keyConverter=hparams["readKeyConverter"],
                                            valueConverter=hparams["readValueConverter"],
                                            conf=hconf)

        parser = func if func != None else rddparse_data_CV
        hbase_rdd = hbase_rdd.map(lambda x: parser(x))

        if collect:
            return hbase_rdd.collect()
        else:
            return hbase_rdd
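
    # --- hedged usage sketch (editorial addition, not part of the original API) ---
    # Assuming an HBase table whose rows follow the cf_pic/cf_info/cf_tag layout
    # parsed by the rddparse_* helpers above, an embed-and-extract pass could be
    # wired up like this (the table name 'ILSVRC_train' is hypothetical):
    #
    #   sparker = Sparker(host='HPC-server', appname='StegAnalysis')
    #   rdd = sparker.read_hbase('ILSVRC_train', func=rddparse_all_ILS)
    #   rdd_stego = rdd.map(rddembed_ILS).filter(lambda x: x is not None)
    #   rdd_feat = rdd_stego.map(rddfeat_ILS)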
[["row8", "f1", "", "caocao cao"], ["row9", "f1", "c1", "asdfg hhhh"]] Data(from dictionary): e.g. data ={'row1':[1,3400,'hello'], 'row2':[34,5000,'here in mine']}, cols = ['cf_info:id', 'cf_info:size', 'cf_tag:desc'] Data(from Rdd): e.g. data =[('row1',[1,3400,'hello']), ('row2',[34,5000,'here in mine'])], cols = ['cf_info:id', 'cf_info:size', 'cf_tag:desc'] """ hconf = {"hbase.zookeeper.quorum": self.host, "hbase.mapreduce.inputtable": table_name, "hbase.mapred.outputtable": table_name, "mapreduce.outputformat.class": hparams["outputFormatClass"], "mapreduce.job.output.key.class": hparams["writeKeyClass"], "mapreduce.job.output.value.class": hparams["writeValueClass"], } cols = [col.split(':') for col in columns] if not fromrdd: rdd_data = self.sc.parallelize(data) else: rdd_data = data rdd_data.flatMap(lambda x: format_out(x, cols)).saveAsNewAPIHadoopDataset( conf=hconf, keyConverter=hparams["writeKeyConverter"], valueConverter=hparams["writeValueConverter"]) def train_svm(self, rdd_labeled): svm = SVMWithSGD.train(rdd_labeled) self.model = svm return svm def train_svm(self, X, Y): # data = [] # for feat, tag in zip(X, Y): # data.append(LabeledPoint(tag, feat)) # svm = SVMWithSGD.train(self.sc.parallelize(data)) hdd_data = self.sc.parallelize(zip(X, Y), 20).map(lambda x: LabeledPoint(x[1], x[0])) svm = SVMWithSGD.train(hdd_data) self.model = svm # with open('res/svm_spark.model', 'wb') as modelfile: # model = pickle.dump(svm, modelfile) return svm def predict_svm(self, x, model=None): if model is None: if self.model != None: model = self.model else: # with open('res/svm_spark.model', 'rb') as modelfile: # model = pickle.load(modelfile) raise Exception("No model available!") return model.predict(x) def test_svm(self, X, Y, model=None): pass