Commit 712861f0014b007d5e6bf52ced1d57d2ff644ecc
Parent: f005aa4a
Branch: refactor

extract rdd from SC.

Showing 5 changed files with 309 additions and 371 deletions
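This commit splits mspark/SC.py in two: the row-level RDD helpers (rddparse_*, rddinfo_ILS, rddembed_ILS, rddembed_ILS_EXT, rddfeat_ILS, rddanalysis_ILS, format_out) move into a new module, mspark/rdd.py, while the Sparker class stays in SC.py, which re-exports the helpers via `from .rdd import *` for backward compatibility. A minimal sketch of the import pattern the commit establishes (the Sparker arguments are copied from the diff; 'some_table' is a placeholder for the real table name):

    # as used inside the package (see mdata/ILSVRC_S.py below)
    from ..mspark import rdd, SC

    sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S',
                         master='spark://HPC-server:7077')
    # row-level parsing is now addressed through rdd.*, not SC.*
    rdd_data = sparker.read_hbase('some_table', func=rdd.rddparse_data_ILS,
                                  collect=False)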
mdata/ILSVRC.py
| 1 | 1 | __author__ = 'chunk' |
| 2 | 2 | |
| 3 | 3 | from . import * |
| 4 | -from ..mfeat import HOG, IntraBlockDiff | |
| 5 | -from ..mspark import SC | |
| 4 | +from ..mfeat import IntraBlockDiff | |
| 5 | +from ..mspark import rdd, SC | |
| 6 | 6 | from ..common import * |
| 7 | 7 | |
| 8 | 8 | import os, sys |
| ... | ... | @@ -83,11 +83,11 @@ class DataILSVRC(DataDumperBase): |
| 83 | 83 | pass |
| 84 | 84 | |
| 85 | 85 | def get_feat(self, image, feattype='ibd', **kwargs): |
| 86 | - size = kwargs.get('size', (48, 48)) | |
| 87 | - | |
| 88 | - if feattype == 'hog': | |
| 89 | - feater = HOG.FeatHOG(size=size) | |
| 90 | - elif feattype == 'ibd': | |
| 86 | + # size = kwargs.get('size', (48, 48)) | |
| 87 | + # | |
| 88 | + # if feattype == 'hog': | |
| 89 | + # feater = HOG.FeatHOG(size=size) | |
| 90 | + if feattype == 'ibd': | |
| 91 | 91 | feater = IntraBlockDiff.FeatIntraBlockDiff() |
| 92 | 92 | else: |
| 93 | 93 | raise Exception("Unknown feature type!") |
| ... | ... | @@ -99,9 +99,9 @@ class DataILSVRC(DataDumperBase): |
| 99 | 99 | |
| 100 | 100 | def extract_feat(self, feattype='ibd'): |
| 101 | 101 | print "extracting feat..." |
| 102 | - if feattype == 'hog': | |
| 103 | - feater = HOG.FeatHOG(size=(48, 48)) | |
| 104 | - elif feattype == 'ibd': | |
| 102 | + # if feattype == 'hog': | |
| 103 | + # feater = HOG.FeatHOG(size=(48, 48)) | |
| 104 | + if feattype == 'ibd': | |
| 105 | 105 | feater = IntraBlockDiff.FeatIntraBlockDiff() |
| 106 | 106 | else: |
| 107 | 107 | raise Exception("Unknown feature type!") |
| ... | ... | @@ -307,7 +307,7 @@ class DataILSVRC(DataDumperBase): |
| 307 | 307 | # cv2.imwrite(os.path.join(base_dir, category + '_crop_cv', name), img_crop) |
| 308 | 308 | # except Exception as e: |
| 309 | 309 | # print '[EXCPT]', e |
| 310 | - # pass | |
| 310 | + # pass | |
| 311 | 311 | |
| 312 | 312 | |
| 313 | 313 | def get_table(self): |
| ... | ... | @@ -322,10 +322,10 @@ class DataILSVRC(DataDumperBase): |
| 322 | 322 | tables = self.connection.tables() |
| 323 | 323 | if self.table_name not in tables: |
| 324 | 324 | families_compressed = {'cf_pic': dict(compression='LZO'), |
| 325 | - 'cf_info': dict(max_versions=10,compression='LZO'), | |
| 326 | - 'cf_tag': dict(compression='LZO'), | |
| 327 | - 'cf_feat': dict(compression='LZO'), | |
| 328 | - } | |
| 325 | + 'cf_info': dict(max_versions=10, compression='LZO'), | |
| 326 | + 'cf_tag': dict(compression='LZO'), | |
| 327 | + 'cf_feat': dict(compression='LZO'), | |
| 328 | + } | |
| 329 | 329 | families = {'cf_pic': dict(), |
| 330 | 330 | 'cf_info': dict(max_versions=10), |
| 331 | 331 | 'cf_tag': dict(), |
| ... | ... |
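Net effect in this file: the HOG branch of get_feat and extract_feat is commented out rather than deleted, so 'ibd' is the only feature type left and any other value falls through to the error branch. A hedged sketch of the resulting dispatch (the DataILSVRC constructor arguments are not shown in these hunks and are omitted here; `image` stands for whatever the feater accepts):

    dumper = DataILSVRC()                           # constructor args assumed
    desc = dumper.get_feat(image, feattype='ibd')   # the one remaining path
    dumper.get_feat(image, feattype='hog')          # now raises "Unknown feature type!"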
mdata/ILSVRC_S.py
| 1 | 1 | __author__ = 'chunk' |
| 2 | 2 | |
| 3 | 3 | from . import * |
| 4 | -from ..mfeat import HOG, IntraBlockDiff | |
| 5 | -from ..mspark import SC | |
| 4 | +from ..mfeat import IntraBlockDiff | |
| 5 | +from ..mspark import rdd, SC | |
| 6 | 6 | from pyspark.mllib.regression import LabeledPoint |
| 7 | 7 | from ..common import * |
| 8 | 8 | |
| ... | ... | @@ -135,11 +135,11 @@ class DataILSVRC_S(DataDumperBase): |
| 135 | 135 | tmpf.close() |
| 136 | 136 | |
| 137 | 137 | def _get_feat(self, image, feattype='ibd', **kwargs): |
| 138 | - size = kwargs.get('size', (48, 48)) | |
| 139 | - | |
| 140 | - if feattype == 'hog': | |
| 141 | - feater = HOG.FeatHOG(size=size) | |
| 142 | - elif feattype == 'ibd': | |
| 138 | + # size = kwargs.get('size', (48, 48)) | |
| 139 | + # | |
| 140 | + # if feattype == 'hog': | |
| 141 | + # feater = HOG.FeatHOG(size=size) | |
| 142 | + if feattype == 'ibd': | |
| 143 | 143 | feater = IntraBlockDiff.FeatIntraBlockDiff() |
| 144 | 144 | else: |
| 145 | 145 | raise Exception("Unknown feature type!") |
| ... | ... | @@ -267,16 +267,16 @@ class DataILSVRC_S(DataDumperBase): |
| 267 | 267 | ] |
| 268 | 268 | |
| 269 | 269 | # # Debug |
| 270 | - # tmp_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_data_ILS, | |
| 270 | + # tmp_data = self.sparker.read_hbase(self.table_name, func=rdd.rddparse_data_ILS, | |
| 271 | 271 | # collect=False) |
| 272 | - # # tmp_data = tmp_data.mapValues(lambda data: [data] + SC.rddinfo_ILS(data)) | |
| 272 | + # # tmp_data = tmp_data.mapValues(lambda data: [data] + rdd.rddinfo_ILS(data)) | |
| 273 | 273 | # print tmp_data.collect()[0][1] |
| 274 | 274 | # return |
| 275 | 275 | |
| 276 | 276 | |
| 277 | - self.rdd_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_data_ILS, | |
| 277 | + self.rdd_data = self.sparker.read_hbase(self.table_name, func=rdd.rddparse_data_ILS, | |
| 278 | 278 | collect=False).mapValues( |
| 279 | - lambda data: [data] + SC.rddinfo_ILS(data)) | |
| 279 | + lambda data: [data] + rdd.rddinfo_ILS(data)) | |
| 280 | 280 | |
| 281 | 281 | if not writeback: |
| 282 | 282 | return self.rdd_data |
| ... | ... | @@ -293,14 +293,14 @@ class DataILSVRC_S(DataDumperBase): |
| 293 | 293 | ] |
| 294 | 294 | |
| 295 | 295 | # # Debug |
| 296 | - # tmp_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_data_ILS, | |
| 296 | + # tmp_data = self.sparker.read_hbase(self.table_name, func=rdd.rddparse_data_ILS, | |
| 297 | 297 | # collect=False) |
| 298 | - # # tmp_data = tmp_data.mapValues(lambda data: [data] + SC.rddinfo_ILS(data)) | |
| 298 | + # # tmp_data = tmp_data.mapValues(lambda data: [data] + rdd.rddinfo_ILS(data)) | |
| 299 | 299 | # print tmp_data.collect()[0][1] |
| 300 | 300 | # return |
| 301 | 301 | |
| 302 | 302 | |
| 303 | - self.rdd_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_data_ILS, | |
| 303 | + self.rdd_data = self.sparker.read_hbase(self.table_name, func=rdd.rddparse_data_ILS, | |
| 304 | 304 | collect=False).mapValues( |
| 305 | 305 | lambda data: [data]) |
| 306 | 306 | |
| ... | ... | @@ -417,12 +417,12 @@ class DataILSVRC_S(DataDumperBase): |
| 417 | 417 | ] |
| 418 | 418 | |
| 419 | 419 | if readforward: |
| 420 | - self.rdd_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_all_ILS, collect=False) | |
| 420 | + self.rdd_data = self.sparker.read_hbase(self.table_name, func=rdd.rddparse_all_ILS, collect=False) | |
| 421 | 421 | |
| 422 | - # rdd_data_ext = self.rdd_data.map(lambda x: SC.rddembed_ILS(x, rate=rate)).filter(lambda x: x != None) | |
| 422 | + # rdd_data_ext = self.rdd_data.map(lambda x: rdd.rddembed_ILS(x, rate=rate)).filter(lambda x: x != None) | |
| 423 | 423 | # self.rdd_data = self.rdd_data.union(rdd_data_ext) |
| 424 | 424 | |
| 425 | - self.rdd_data = self.rdd_data.flatMap(lambda x: SC.rddembed_ILS_EXT(x, rate=rate)) | |
| 425 | + self.rdd_data = self.rdd_data.flatMap(lambda x: rdd.rddembed_ILS_EXT(x, rate=rate)) | |
| 426 | 426 | if not writeback: |
| 427 | 427 | return self.rdd_data |
| 428 | 428 | else: |
| ... | ... | @@ -513,9 +513,9 @@ class DataILSVRC_S(DataDumperBase): |
| 513 | 513 | ] |
| 514 | 514 | |
| 515 | 515 | if readforward: |
| 516 | - self.rdd_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_all_ILS, collect=False) | |
| 516 | + self.rdd_data = self.sparker.read_hbase(self.table_name, func=rdd.rddparse_all_ILS, collect=False) | |
| 517 | 517 | |
| 518 | - self.rdd_data = self.rdd_data.mapValues(lambda items: SC.rddfeat_ILS(items, feattype)) | |
| 518 | + self.rdd_data = self.rdd_data.mapValues(lambda items: rdd.rddfeat_ILS(items, feattype)) | |
| 519 | 519 | |
| 520 | 520 | # print self.rdd_data.collect()[0] |
| 521 | 521 | # return |
| ... | ... | @@ -541,9 +541,9 @@ class DataILSVRC_S(DataDumperBase): |
| 541 | 541 | ] |
| 542 | 542 | |
| 543 | 543 | if readforward: |
| 544 | - self.rdd_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_all_ILS, collect=False) | |
| 544 | + self.rdd_data = self.sparker.read_hbase(self.table_name, func=rdd.rddparse_all_ILS, collect=False) | |
| 545 | 545 | |
| 546 | - self.rdd_data = self.rdd_data.mapValues(lambda items: SC.rddanalysis_ILS(items)) | |
| 546 | + self.rdd_data = self.rdd_data.mapValues(lambda items: rdd.rddanalysis_ILS(items)) | |
| 547 | 547 | |
| 548 | 548 | # print self.rdd_data.collect()[0] |
| 549 | 549 | # return |
| ... | ... | @@ -621,7 +621,7 @@ class DataILSVRC_S(DataDumperBase): |
| 621 | 621 | self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', |
| 622 | 622 | master='spark://HPC-server:7077') |
| 623 | 623 | |
| 624 | - rdd_dataset = self.sparker.read_hbase(self.table_name, func=SC.rddparse_dataset_ILS, collect=False) | |
| 624 | + rdd_dataset = self.sparker.read_hbase(self.table_name, func=rdd.rddparse_dataset_ILS, collect=False) | |
| 625 | 625 | if not collect: |
| 626 | 626 | rdd_dataset = rdd_dataset.map(lambda x: LabeledPoint(x[0], x[1])) |
| 627 | 627 | return rdd_dataset |
| ... | ... |
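Every change in ILSVRC_S.py is the same mechanical repointing from SC.* to rdd.*; the pipeline itself is untouched. Condensed, the chain these methods build reads as below (writeback/readforward handling omitted; all calls are taken verbatim from the hunks above):

    rdd_data = self.sparker.read_hbase(self.table_name, func=rdd.rddparse_data_ILS,
                                       collect=False)
    rdd_data = rdd_data.mapValues(lambda data: [data] + rdd.rddinfo_ILS(data))     # attach image info
    rdd_data = rdd_data.flatMap(lambda x: rdd.rddembed_ILS_EXT(x, rate=rate))      # original + stego copy
    rdd_data = rdd_data.mapValues(lambda items: rdd.rddfeat_ILS(items, feattype))  # append feature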
mmodel/svm/SVM.py
| ... | ... | @@ -9,7 +9,7 @@ import os, sys |
| 9 | 9 | from ...mfeat import * |
| 10 | 10 | from ...mmodel import * |
| 11 | 11 | from ...mmodel.svm.svmutil import * |
| 12 | -from ...mspark import SC2 | |
| 12 | +from ...mspark import SC | |
| 13 | 13 | from ...common import * |
| 14 | 14 | |
| 15 | 15 | import numpy as np |
| ... | ... | @@ -191,7 +191,7 @@ class ModelSVM(ModelBase): |
| 191 | 191 | |
| 192 | 192 | def _train_spark(self, X, Y=None): |
| 193 | 193 | if self.sparker == None: |
| 194 | - self.sparker = SC2.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077') | |
| 194 | + self.sparker = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077') | |
| 195 | 195 | |
| 196 | 196 | self.model = self.sparker.train_svm(X, Y) |
| 197 | 197 | |
| ... | ... |
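Only the module name changes here: ModelSVM now builds its Sparker from the consolidated SC module instead of the removed SC2 variant. For reference, a standalone sketch of the same call (X is assumed to hold feature vectors and Y the labels; train_svm appears in this diff only through this call site):

    from mspark import SC

    sparker = SC.Sparker(host='HPC-server', appname='ImageCV',
                         master='spark://HPC-server:7077')
    model = sparker.train_svm(X, Y)   # mirrors ModelSVM._train_spark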
mspark/SC.py
| ... | ... | @@ -4,349 +4,20 @@ __author__ = 'chunk' |
| 4 | 4 | from ..common import * |
| 5 | 5 | from .dependencies import * |
| 6 | 6 | from . import * |
| 7 | -# from ..mdata import MSR, CV, ILSVRC, ILSVRC_S | |
| 8 | - | |
| 9 | -from ..mjpeg import * | |
| 10 | -from ..msteg import * | |
| 11 | -from ..msteg.steganography import LSB, F3, F4, F5 | |
| 12 | -from ..mfeat import IntraBlockDiff | |
| 13 | -from ..mmodel.svm import SVM2 | |
| 7 | +from .rdd import * | |
| 14 | 8 | |
| 15 | 9 | import sys |
| 16 | 10 | from pyspark import RDD |
| 17 | 11 | from pyspark import SparkConf, SparkContext |
| 18 | 12 | from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD |
| 19 | 13 | from pyspark.mllib.regression import LabeledPoint |
| 20 | -from numpy import array | |
| 21 | -import json | |
| 22 | -import pickle | |
| 23 | -import tempfile | |
| 14 | + | |
| 24 | 15 | |
| 25 | 16 | import numpy as np |
| 26 | -from scipy import stats | |
| 27 | -from hashlib import md5 | |
| 17 | + | |
| 28 | 18 | |
| 29 | 19 | np.random.seed(sum(map(ord, "whoami"))) |
| 30 | 20 | package_dir = os.path.dirname(os.path.abspath(__file__)) |
| 31 | -classifier = SVM2.ModelSVM(toolset='sklearn') | |
| 32 | - | |
| 33 | - | |
| 34 | -def rddparse_data_CV(raw_row): | |
| 35 | - """ | |
| 36 | - input: (u'key0',u'cf_feat:hog:[0.056273,...]--%--cf_pic:data:\ufffd\ufffd\...--%--cf_tag:hog:True') | |
| 37 | - return: ([0.056273,...],1) | |
| 38 | - """ | |
| 39 | - data = raw_row[1].split('--%--') | |
| 40 | - feat = json.loads(data[0].split(':')[-1]) | |
| 41 | - tag = 1 if data[-1].split(':')[-1] == 'True' else 0 | |
| 42 | - return (feat, tag) | |
| 43 | - | |
| 44 | - | |
| 45 | -def rddparse_data_ILS(raw_row): | |
| 46 | - """ | |
| 47 | - input: (u'key0',u'cf_feat:hog:[0.056273,...]--%--cf_pic:data:\ufffd\ufffd\...--%--cf_tag:hog:True') | |
| 48 | - return: ([0.056273,...],1) | |
| 49 | - | |
| 50 | - In fact we can also use mapValues. | |
| 51 | - """ | |
| 52 | - key = raw_row[0] | |
| 53 | - # if key == '04650c488a2b163ca8a1f52da6022f03.jpg': | |
| 54 | - # with open('/tmp/hhhh','wb') as f: | |
| 55 | - # f.write(raw_row[1].decode('unicode-escape')).encode('latin-1') | |
| 56 | - items = raw_row[1].decode('unicode-escape').encode('latin-1').split('--%--') | |
| 57 | - data = items[0].split('cf_pic:data:')[-1] | |
| 58 | - return (key, data) | |
| 59 | - | |
| 60 | - | |
| 61 | -def rddparse_all_ILS(raw_row): | |
| 62 | - """ | |
| 63 | - Deprecated | |
| 64 | - """ | |
| 65 | - key = raw_row[0] | |
| 66 | - items = raw_row[1].decode('unicode-escape').encode('latin-1').split('--%--') | |
| 67 | - | |
| 68 | - # @TODO | |
| 69 | - # N.B "ValueError: No JSON object could be decoded" Because the spark-hbase IO is based on strings. | |
| 70 | - # And the order of items is not as expected. See ../res/row-sample.txt or check in hbase shell for that. | |
| 71 | - | |
| 72 | - data = [items[0].split('cf_pic:data:')[-1]] + [json.loads(item.split(':')[-1]) for item in | |
| 73 | - items[1:]] | |
| 74 | - | |
| 75 | - return (key, data) | |
| 76 | - | |
| 77 | - | |
| 78 | -def rddparse_dataset_ILS(raw_row): | |
| 79 | - if raw_row[0] == '04650c488a2b163ca8a1f52da6022f03.jpg': | |
| 80 | - print raw_row | |
| 81 | - items = raw_row[1].decode('unicode-escape').encode('latin-1').split('--%--') | |
| 82 | - # tag = int(items[-2].split('cf_tag:' + tagtype)[-1]) | |
| 83 | - # feat = [item for sublist in json.loads(items[-1].split('cf_feat:' + feattype)[-1]) for subsublist in sublist for item in subsublist] | |
| 84 | - tag = int(items[-1].split(':')[-1]) | |
| 85 | - feat = [item for sublist in json.loads(items[0].split(':')[-1]) for subsublist in sublist for | |
| 86 | - item in subsublist] | |
| 87 | - | |
| 88 | - return (tag, feat) | |
| 89 | - | |
| 90 | - | |
| 91 | -def rddinfo_ILS(img, info_rate=None, tag_chosen=None, tag_class=None): | |
| 92 | - """ | |
| 93 | - Tempfile is our friend. (?) | |
| 94 | - """ | |
| 95 | - info_rate = info_rate if info_rate != None else 0.0 | |
| 96 | - tag_chosen = tag_chosen if tag_chosen != None else stats.bernoulli.rvs(0.8) | |
| 97 | - tag_class = tag_class if tag_class != None else 0 | |
| 98 | - try: | |
| 99 | - tmpf = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b', delete=True) | |
| 100 | - tmpf.write(img) | |
| 101 | - tmpf.seek(0) | |
| 102 | - im = Jpeg(tmpf.name, key=sample_key) | |
| 103 | - info = [ | |
| 104 | - im.image_width, | |
| 105 | - im.image_height, | |
| 106 | - im.image_width * im.image_height, | |
| 107 | - im.getCapacity(), | |
| 108 | - im.getQuality(), | |
| 109 | - info_rate, | |
| 110 | - tag_chosen, | |
| 111 | - tag_class | |
| 112 | - ] | |
| 113 | - return info | |
| 114 | - except Exception as e: | |
| 115 | - print e | |
| 116 | - raise | |
| 117 | - finally: | |
| 118 | - tmpf.close() | |
| 119 | - | |
| 120 | - | |
| 121 | -def rddembed_ILS(row, rate=None): | |
| 122 | - """ | |
| 123 | - input: | |
| 124 | - e.g. row =('row1',[1,3400,'hello']) | |
| 125 | - return: | |
| 126 | - newrow = ('row2',[34,5400,'embeded']) | |
| 127 | - """ | |
| 128 | - items = row[1] | |
| 129 | - capacity, chosen = int(items[4]), int(items[7]) | |
| 130 | - if chosen == 0: | |
| 131 | - return None | |
| 132 | - try: | |
| 133 | - tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
| 134 | - tmpf_src.write(items[0]) | |
| 135 | - tmpf_src.seek(0) | |
| 136 | - tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
| 137 | - | |
| 138 | - steger = F5.F5(sample_key, 1) | |
| 139 | - | |
| 140 | - if rate == None: | |
| 141 | - embed_rate = steger.embed_raw_data(tmpf_src.name, | |
| 142 | - os.path.join(package_dir, '../res/toembed'), | |
| 143 | - tmpf_dst.name) | |
| 144 | - else: | |
| 145 | - assert (rate >= 0 and rate < 1) | |
| 146 | - # print capacity | |
| 147 | - hidden = np.random.bytes(int(int(capacity) * rate) / 8) | |
| 148 | - embed_rate = steger.embed_raw_data(tmpf_src.name, hidden, tmpf_dst.name, frommem=True) | |
| 149 | - | |
| 150 | - tmpf_dst.seek(0) | |
| 151 | - raw = tmpf_dst.read() | |
| 152 | - index = md5(raw).hexdigest() | |
| 153 | - | |
| 154 | - return (index + '.jpg', [raw] + rddinfo_ILS(raw, embed_rate, 0, 1)) | |
| 155 | - | |
| 156 | - except Exception as e: | |
| 157 | - print e | |
| 158 | - raise | |
| 159 | - finally: | |
| 160 | - tmpf_src.close() | |
| 161 | - tmpf_dst.close() | |
| 162 | - | |
| 163 | - | |
| 164 | -def rddembed_ILS_EXT(row, rate=None): | |
| 165 | - """ | |
| 166 | - input: | |
| 167 | - e.g. row =('row1',[1,3400,'hello']) | |
| 168 | - return: | |
| 169 | - newrow = ('row2',[34,5400,'embeded']) or NULL | |
| 170 | - [row,newrow] | |
| 171 | - """ | |
| 172 | - items = row[1] | |
| 173 | - capacity, chosen = int(items[4]), int(items[7]) | |
| 174 | - if chosen == 0: | |
| 175 | - return [row] | |
| 176 | - try: | |
| 177 | - tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
| 178 | - tmpf_src.write(items[0]) | |
| 179 | - tmpf_src.seek(0) | |
| 180 | - tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
| 181 | - | |
| 182 | - steger = F5.F5(sample_key, 2) | |
| 183 | - | |
| 184 | - if rate == None: | |
| 185 | - embed_rate = steger.embed_raw_data(tmpf_src.name, | |
| 186 | - os.path.join(package_dir, '../res/toembed'), | |
| 187 | - tmpf_dst.name) | |
| 188 | - else: | |
| 189 | - assert (rate >= 0 and rate < 1) | |
| 190 | - # print capacity | |
| 191 | - hidden = np.random.bytes(int(int(capacity) * rate) / 8) | |
| 192 | - embed_rate = steger.embed_raw_data(tmpf_src.name, hidden, tmpf_dst.name, frommem=True) | |
| 193 | - | |
| 194 | - tmpf_dst.seek(0) | |
| 195 | - raw = tmpf_dst.read() | |
| 196 | - index = md5(raw).hexdigest() | |
| 197 | - | |
| 198 | - return [row, (index + '.jpg', [raw] + rddinfo_ILS(raw, embed_rate, 0, 1))] | |
| 199 | - | |
| 200 | - except Exception as e: | |
| 201 | - print e | |
| 202 | - raise | |
| 203 | - finally: | |
| 204 | - tmpf_src.close() | |
| 205 | - tmpf_dst.close() | |
| 206 | - | |
| 207 | - | |
| 208 | -def _get_feat(image, feattype='ibd', **kwargs): | |
| 209 | - if feattype == 'ibd': | |
| 210 | - feater = IntraBlockDiff.FeatIntraBlockDiff() | |
| 211 | - else: | |
| 212 | - raise Exception("Unknown feature type!") | |
| 213 | - | |
| 214 | - desc = feater.feat(image) | |
| 215 | - | |
| 216 | - return desc | |
| 217 | - | |
| 218 | - | |
| 219 | -def rddfeat_ILS(items, feattype='ibd', **kwargs): | |
| 220 | - try: | |
| 221 | - tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
| 222 | - tmpf_src.write(items[0]) | |
| 223 | - tmpf_src.seek(0) | |
| 224 | - | |
| 225 | - desc = json.dumps(_get_feat(tmpf_src.name, feattype=feattype).tolist()) | |
| 226 | - # print 'desccccccccccccccccccc',desc | |
| 227 | - return items + [desc] | |
| 228 | - | |
| 229 | - except Exception as e: | |
| 230 | - print e | |
| 231 | - raise | |
| 232 | - finally: | |
| 233 | - tmpf_src.close() | |
| 234 | - | |
| 235 | - | |
| 236 | -def rddanalysis_ILS(items, feattype='ibd', **kwargs): | |
| 237 | - head = np.fromstring(items[0][:2], dtype=np.uint8) | |
| 238 | - if not np.array_equal(head, [255, 216]): | |
| 239 | - return items + [0] | |
| 240 | - try: | |
| 241 | - tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
| 242 | - tmpf_src.write(items[0]) | |
| 243 | - tmpf_src.seek(0) | |
| 244 | - | |
| 245 | - desc = _get_feat(tmpf_src.name, feattype=feattype) | |
| 246 | - tag = classifier.predict(desc.ravel())[0] | |
| 247 | - # print 'desccccccccccccccccccc',desc | |
| 248 | - return items + [tag] | |
| 249 | - | |
| 250 | - except Exception as e: | |
| 251 | - print e | |
| 252 | - raise | |
| 253 | - finally: | |
| 254 | - tmpf_src.close() | |
| 255 | - | |
| 256 | - # return items + classifier.predict(items[-1]) | |
| 257 | - | |
| 258 | - | |
| 259 | -def format_out(row, cols, withdata=False): | |
| 260 | - """ | |
| 261 | - input: | |
| 262 | - e.g. row =('row1',[1,3400,'hello']) | |
| 263 | - cols = [['cf_info', 'id'], ['cf_info', 'size'], ['cf_tag', 'desc']] | |
| 264 | - return: | |
| 265 | - [('row1',['row1', 'cf_info', 'id', '1']),('row1',['row1', 'cf_info', 'size', '3400']),('row1',['row1', 'cf_tag', 'desc', 'hello'])] | |
| 266 | - """ | |
| 267 | - puts = [] | |
| 268 | - key = row[0] | |
| 269 | - # if key == '04650c488a2b163ca8a1f52da6022f03.jpg': | |
| 270 | - # print row | |
| 271 | - if not withdata: | |
| 272 | - for data, col in zip(row[1][1:], cols[1:]): | |
| 273 | - puts.append((key, [key] + col + [str(data)])) | |
| 274 | - else: | |
| 275 | - for data, col in zip(row[1], cols): | |
| 276 | - puts.append((key, [key] + col + [str(data)])) | |
| 277 | - return puts | |
| 278 | - | |
| 279 | - | |
| 280 | -# scconf = SparkConf() | |
| 281 | -# scconf.setSparkHome("HPC-server") \ | |
| 282 | -# .setMaster("spark://HPC-server:7077") \ | |
| 283 | -# .setAppName("example") | |
| 284 | -# sc = SparkContext(conf=scconf) | |
| 285 | -# | |
| 286 | -# | |
| 287 | -# def read_hbase(table_name, func=None, collect=False): | |
| 288 | -# """ | |
| 289 | -# ref - http://happybase.readthedocs.org/en/latest/user.html#retrieving-data | |
| 290 | -# | |
| 291 | -# Filter format: | |
| 292 | -# columns=['cf1:col1', 'cf1:col2'] | |
| 293 | -# or | |
| 294 | -# columns=['cf1'] | |
| 295 | -# | |
| 296 | -# """ | |
| 297 | -# | |
| 298 | -# hconf = { | |
| 299 | -# "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2", | |
| 300 | -# # "hbase.zookeeper.quorum": self.host, | |
| 301 | -# "hbase.mapreduce.inputtable": table_name, | |
| 302 | -# } | |
| 303 | -# | |
| 304 | -# hbase_rdd = sc.newAPIHadoopRDD(inputFormatClass=hparams["inputFormatClass"], | |
| 305 | -# keyClass=hparams["readKeyClass"], | |
| 306 | -# valueClass=hparams["readValueClass"], | |
| 307 | -# keyConverter=hparams["readKeyConverter"], | |
| 308 | -# valueConverter=hparams["readValueConverter"], | |
| 309 | -# conf=hconf) | |
| 310 | -# | |
| 311 | -# parser = func if func != None else rddparse_data_CV | |
| 312 | -# hbase_rdd = hbase_rdd.map(lambda x: parser(x)) | |
| 313 | -# | |
| 314 | -# if collect: | |
| 315 | -# return hbase_rdd.collect() | |
| 316 | -# else: | |
| 317 | -# return hbase_rdd | |
| 318 | -# | |
| 319 | -# | |
| 320 | -# def write_hbase(table_name, data, fromrdd=False, columns=None, withdata=False): | |
| 321 | -# """ | |
| 322 | -# Data Format: (Deprecated) | |
| 323 | -# e.g. [["row8", "f1", "", "caocao cao"], ["row9", "f1", "c1", "asdfg hhhh"]] | |
| 324 | -# | |
| 325 | -# Data(from dictionary): | |
| 326 | -# e.g. data ={'row1':[1,3400,'hello'], 'row2':[34,5000,'here in mine']}, | |
| 327 | -# cols = ['cf_info:id', 'cf_info:size', 'cf_tag:desc'] | |
| 328 | -# Data(from Rdd): | |
| 329 | -# e.g. data =[('row1',[1,3400,'hello']), ('row2',[34,5000,'here in mine'])], | |
| 330 | -# cols = ['cf_info:id', 'cf_info:size', 'cf_tag:desc'] | |
| 331 | -# """ | |
| 332 | -# hconf = { | |
| 333 | -# "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2", # "hbase.zookeeper.quorum": self.host, | |
| 334 | -# "hbase.mapreduce.inputtable": table_name, | |
| 335 | -# "hbase.mapred.outputtable": table_name, | |
| 336 | -# "mapreduce.outputformat.class": hparams["outputFormatClass"], | |
| 337 | -# "mapreduce.job.output.key.class": hparams["writeKeyClass"], | |
| 338 | -# "mapreduce.job.output.value.class": hparams["writeValueClass"], | |
| 339 | -# } | |
| 340 | -# cols = [col.split(':') for col in columns] | |
| 341 | -# if not fromrdd: | |
| 342 | -# rdd_data = sc.parallelize(data) | |
| 343 | -# else: | |
| 344 | -# rdd_data = data | |
| 345 | -# | |
| 346 | -# rdd_data.flatMap(lambda x: format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset( | |
| 347 | -# conf=hconf, | |
| 348 | -# keyConverter=hparams["writeKeyConverter"], | |
| 349 | -# valueConverter=hparams["writeValueConverter"]) | |
| 350 | 21 | |
| 351 | 22 | |
| 352 | 23 | class Sparker(object): |
| ... | ... |
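SC.py is kept as a facade: because it now does `from .rdd import *`, names like SC.rddparse_data_ILS still resolve to the moved functions, so call sites that were not repointed (e.g. the commented debug lines above) would have kept working either way. A quick equivalence check, assuming the package imports cleanly:

    from mspark import rdd, SC

    # after the re-export, both spellings name the same function object
    assert SC.rddparse_data_ILS is rdd.rddparse_data_ILS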
mspark/rdd.py
| ... | ... | @@ -0,0 +1,267 @@ |
| 1 | +__author__ = 'hadoop' | |
| 2 | + | |
| 3 | +from ..common import * | |
| 4 | + | |
| 5 | +from ..mjpeg import * | |
| 6 | +from ..msteg import * | |
| 7 | +from ..msteg.steganography import LSB, F3, F4, F5 | |
| 8 | +from ..mfeat import IntraBlockDiff | |
| 9 | +from ..mmodel.svm import SVM | |
| 10 | + | |
| 11 | +from numpy import array | |
| 12 | +import json | |
| 13 | +import pickle | |
| 14 | +import tempfile | |
| 15 | + | |
| 16 | +import numpy as np | |
| 17 | +from scipy import stats | |
| 18 | +from hashlib import md5 | |
| 19 | + | |
| 20 | +np.random.seed(sum(map(ord, "whoami"))) | |
| 21 | +package_dir = os.path.dirname(os.path.abspath(__file__)) | |
| 22 | +classifier = SVM.ModelSVM(toolset='sklearn') | |
| 23 | + | |
| 24 | +def rddparse_data_CV(raw_row): | |
| 25 | + """ | |
| 26 | + input: (u'key0',u'cf_feat:hog:[0.056273,...]--%--cf_pic:data:\ufffd\ufffd\...--%--cf_tag:hog:True') | |
| 27 | + return: ([0.056273,...],1) | |
| 28 | + """ | |
| 29 | + data = raw_row[1].split('--%--') | |
| 30 | + feat = json.loads(data[0].split(':')[-1]) | |
| 31 | + tag = 1 if data[-1].split(':')[-1] == 'True' else 0 | |
| 32 | + return (feat, tag) | |
| 33 | + | |
| 34 | + | |
| 35 | +def rddparse_data_ILS(raw_row): | |
| 36 | + """ | |
| 37 | + input: (u'key0',u'cf_feat:hog:[0.056273,...]--%--cf_pic:data:\ufffd\ufffd\...--%--cf_tag:hog:True') | |
| 38 | + return: ([0.056273,...],1) | |
| 39 | + | |
| 40 | + In fact we can also use mapValues. | |
| 41 | + """ | |
| 42 | + key = raw_row[0] | |
| 43 | + # if key == '04650c488a2b163ca8a1f52da6022f03.jpg': | |
| 44 | + # with open('/tmp/hhhh','wb') as f: | |
| 45 | + # f.write(raw_row[1].decode('unicode-escape')).encode('latin-1') | |
| 46 | + items = raw_row[1].decode('unicode-escape').encode('latin-1').split('--%--') | |
| 47 | + data = items[0].split('cf_pic:data:')[-1] | |
| 48 | + return (key, data) | |
| 49 | + | |
| 50 | + | |
| 51 | +def rddparse_all_ILS(raw_row): | |
| 52 | + """ | |
| 53 | + Deprecated | |
| 54 | + """ | |
| 55 | + key = raw_row[0] | |
| 56 | + items = raw_row[1].decode('unicode-escape').encode('latin-1').split('--%--') | |
| 57 | + | |
| 58 | + # @TODO | |
| 59 | + # N.B "ValueError: No JSON object could be decoded" Because the spark-hbase IO is based on strings. | |
| 60 | + # And the order of items is not as expected. See ../res/row-sample.txt or check in hbase shell for that. | |
| 61 | + | |
| 62 | + data = [items[0].split('cf_pic:data:')[-1]] + [json.loads(item.split(':')[-1]) for item in | |
| 63 | + items[1:]] | |
| 64 | + | |
| 65 | + return (key, data) | |
| 66 | + | |
| 67 | + | |
| 68 | +def rddparse_dataset_ILS(raw_row): | |
| 69 | + if raw_row[0] == '04650c488a2b163ca8a1f52da6022f03.jpg': | |
| 70 | + print raw_row | |
| 71 | + items = raw_row[1].decode('unicode-escape').encode('latin-1').split('--%--') | |
| 72 | + # tag = int(items[-2].split('cf_tag:' + tagtype)[-1]) | |
| 73 | + # feat = [item for sublist in json.loads(items[-1].split('cf_feat:' + feattype)[-1]) for subsublist in sublist for item in subsublist] | |
| 74 | + tag = int(items[-1].split(':')[-1]) | |
| 75 | + feat = [item for sublist in json.loads(items[0].split(':')[-1]) for subsublist in sublist for | |
| 76 | + item in subsublist] | |
| 77 | + | |
| 78 | + return (tag, feat) | |
| 79 | + | |
| 80 | + | |
| 81 | +def rddinfo_ILS(img, info_rate=None, tag_chosen=None, tag_class=None): | |
| 82 | + """ | |
| 83 | + Tempfile is our friend. (?) | |
| 84 | + """ | |
| 85 | + info_rate = info_rate if info_rate != None else 0.0 | |
| 86 | + tag_chosen = tag_chosen if tag_chosen != None else stats.bernoulli.rvs(0.8) | |
| 87 | + tag_class = tag_class if tag_class != None else 0 | |
| 88 | + try: | |
| 89 | + tmpf = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b', delete=True) | |
| 90 | + tmpf.write(img) | |
| 91 | + tmpf.seek(0) | |
| 92 | + im = Jpeg(tmpf.name, key=sample_key) | |
| 93 | + info = [ | |
| 94 | + im.image_width, | |
| 95 | + im.image_height, | |
| 96 | + im.image_width * im.image_height, | |
| 97 | + im.getCapacity(), | |
| 98 | + im.getQuality(), | |
| 99 | + info_rate, | |
| 100 | + tag_chosen, | |
| 101 | + tag_class | |
| 102 | + ] | |
| 103 | + return info | |
| 104 | + except Exception as e: | |
| 105 | + print e | |
| 106 | + raise | |
| 107 | + finally: | |
| 108 | + tmpf.close() | |
| 109 | + | |
| 110 | + | |
| 111 | +def rddembed_ILS(row, rate=None): | |
| 112 | + """ | |
| 113 | + input: | |
| 114 | + e.g. row =('row1',[1,3400,'hello']) | |
| 115 | + return: | |
| 116 | + newrow = ('row2',[34,5400,'embeded']) | |
| 117 | + """ | |
| 118 | + items = row[1] | |
| 119 | + capacity, chosen = int(items[4]), int(items[7]) | |
| 120 | + if chosen == 0: | |
| 121 | + return None | |
| 122 | + try: | |
| 123 | + tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
| 124 | + tmpf_src.write(items[0]) | |
| 125 | + tmpf_src.seek(0) | |
| 126 | + tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
| 127 | + | |
| 128 | + steger = F5.F5(sample_key, 1) | |
| 129 | + | |
| 130 | + if rate == None: | |
| 131 | + embed_rate = steger.embed_raw_data(tmpf_src.name, | |
| 132 | + os.path.join(package_dir, '../res/toembed'), | |
| 133 | + tmpf_dst.name) | |
| 134 | + else: | |
| 135 | + assert (rate >= 0 and rate < 1) | |
| 136 | + # print capacity | |
| 137 | + hidden = np.random.bytes(int(int(capacity) * rate) / 8) | |
| 138 | + embed_rate = steger.embed_raw_data(tmpf_src.name, hidden, tmpf_dst.name, frommem=True) | |
| 139 | + | |
| 140 | + tmpf_dst.seek(0) | |
| 141 | + raw = tmpf_dst.read() | |
| 142 | + index = md5(raw).hexdigest() | |
| 143 | + | |
| 144 | + return (index + '.jpg', [raw] + rddinfo_ILS(raw, embed_rate, 0, 1)) | |
| 145 | + | |
| 146 | + except Exception as e: | |
| 147 | + print e | |
| 148 | + raise | |
| 149 | + finally: | |
| 150 | + tmpf_src.close() | |
| 151 | + tmpf_dst.close() | |
| 152 | + | |
| 153 | + | |
| 154 | +def rddembed_ILS_EXT(row, rate=None): | |
| 155 | + """ | |
| 156 | + input: | |
| 157 | + e.g. row =('row1',[1,3400,'hello']) | |
| 158 | + return: | |
| 159 | + newrow = ('row2',[34,5400,'embeded']) or NULL | |
| 160 | + [row,newrow] | |
| 161 | + """ | |
| 162 | + items = row[1] | |
| 163 | + capacity, chosen = int(items[4]), int(items[7]) | |
| 164 | + if chosen == 0: | |
| 165 | + return [row] | |
| 166 | + try: | |
| 167 | + tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
| 168 | + tmpf_src.write(items[0]) | |
| 169 | + tmpf_src.seek(0) | |
| 170 | + tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
| 171 | + | |
| 172 | + steger = F5.F5(sample_key, 2) | |
| 173 | + | |
| 174 | + if rate == None: | |
| 175 | + embed_rate = steger.embed_raw_data(tmpf_src.name, | |
| 176 | + os.path.join(package_dir, '../res/toembed'), | |
| 177 | + tmpf_dst.name) | |
| 178 | + else: | |
| 179 | + assert (rate >= 0 and rate < 1) | |
| 180 | + # print capacity | |
| 181 | + hidden = np.random.bytes(int(int(capacity) * rate) / 8) | |
| 182 | + embed_rate = steger.embed_raw_data(tmpf_src.name, hidden, tmpf_dst.name, frommem=True) | |
| 183 | + | |
| 184 | + tmpf_dst.seek(0) | |
| 185 | + raw = tmpf_dst.read() | |
| 186 | + index = md5(raw).hexdigest() | |
| 187 | + | |
| 188 | + return [row, (index + '.jpg', [raw] + rddinfo_ILS(raw, embed_rate, 0, 1))] | |
| 189 | + | |
| 190 | + except Exception as e: | |
| 191 | + print e | |
| 192 | + raise | |
| 193 | + finally: | |
| 194 | + tmpf_src.close() | |
| 195 | + tmpf_dst.close() | |
| 196 | + | |
| 197 | + | |
| 198 | +def _get_feat(image, feattype='ibd', **kwargs): | |
| 199 | + if feattype == 'ibd': | |
| 200 | + feater = IntraBlockDiff.FeatIntraBlockDiff() | |
| 201 | + else: | |
| 202 | + raise Exception("Unknown feature type!") | |
| 203 | + | |
| 204 | + desc = feater.feat(image) | |
| 205 | + | |
| 206 | + return desc | |
| 207 | + | |
| 208 | + | |
| 209 | +def rddfeat_ILS(items, feattype='ibd', **kwargs): | |
| 210 | + try: | |
| 211 | + tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
| 212 | + tmpf_src.write(items[0]) | |
| 213 | + tmpf_src.seek(0) | |
| 214 | + | |
| 215 | + desc = json.dumps(_get_feat(tmpf_src.name, feattype=feattype).tolist()) | |
| 216 | + # print 'desccccccccccccccccccc',desc | |
| 217 | + return items + [desc] | |
| 218 | + | |
| 219 | + except Exception as e: | |
| 220 | + print e | |
| 221 | + raise | |
| 222 | + finally: | |
| 223 | + tmpf_src.close() | |
| 224 | + | |
| 225 | + | |
| 226 | +def rddanalysis_ILS(items, feattype='ibd', **kwargs): | |
| 227 | + head = np.fromstring(items[0][:2], dtype=np.uint8) | |
| 228 | + if not np.array_equal(head, [255, 216]): | |
| 229 | + return items + [0] | |
| 230 | + try: | |
| 231 | + tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b') | |
| 232 | + tmpf_src.write(items[0]) | |
| 233 | + tmpf_src.seek(0) | |
| 234 | + | |
| 235 | + desc = _get_feat(tmpf_src.name, feattype=feattype) | |
| 236 | + tag = classifier.predict(desc.ravel())[0] | |
| 237 | + # print 'desccccccccccccccccccc',desc | |
| 238 | + return items + [tag] | |
| 239 | + | |
| 240 | + except Exception as e: | |
| 241 | + print e | |
| 242 | + raise | |
| 243 | + finally: | |
| 244 | + tmpf_src.close() | |
| 245 | + | |
| 246 | + # return items + classifier.predict(items[-1]) | |
| 247 | + | |
| 248 | + | |
| 249 | +def format_out(row, cols, withdata=False): | |
| 250 | + """ | |
| 251 | + input: | |
| 252 | + e.g. row =('row1',[1,3400,'hello']) | |
| 253 | + cols = [['cf_info', 'id'], ['cf_info', 'size'], ['cf_tag', 'desc']] | |
| 254 | + return: | |
| 255 | + [('row1',['row1', 'cf_info', 'id', '1']),('row1',['row1', 'cf_info', 'size', '3400']),('row1',['row1', 'cf_tag', 'desc', 'hello'])] | |
| 256 | + """ | |
| 257 | + puts = [] | |
| 258 | + key = row[0] | |
| 259 | + # if key == '04650c488a2b163ca8a1f52da6022f03.jpg': | |
| 260 | + # print row | |
| 261 | + if not withdata: | |
| 262 | + for data, col in zip(row[1][1:], cols[1:]): | |
| 263 | + puts.append((key, [key] + col + [str(data)])) | |
| 264 | + else: | |
| 265 | + for data, col in zip(row[1], cols): | |
| 266 | + puts.append((key, [key] + col + [str(data)])) | |
| 267 | + return puts |
| ... | ... |
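format_out's docstring pins down the contract; the following runnable check reproduces its example. Note that the documented output corresponds to withdata=True: with the default withdata=False, the first item of row[1] (normally the raw image data) and its column are skipped.

    row = ('row1', [1, 3400, 'hello'])
    cols = [['cf_info', 'id'], ['cf_info', 'size'], ['cf_tag', 'desc']]

    puts = format_out(row, cols, withdata=True)
    # [('row1', ['row1', 'cf_info', 'id', '1']),
    #  ('row1', ['row1', 'cf_info', 'size', '3400']),
    #  ('row1', ['row1', 'cf_tag', 'desc', 'hello'])]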