Commit 712861f0014b007d5e6bf52ced1d57d2ff644ecc
Parent: f005aa4a
Exists in: refactor

    extract rdd from SC.

Showing 5 changed files with 309 additions and 371 deletions.
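Note: the point of the commit is a module split. The rddXXX helpers that used to sit at the top of mspark/SC.py move into a new module (mspark/rdd.py, inferred from the new `from ..mspark import rdd` / `from .rdd import *` imports), SC.py keeps only the Sparker driver class and re-exports the helpers, and mmodel/svm/SVM.py drops the experimental SC2 module in favor of the consolidated SC. A rough map of the result, with names taken from the diffs below:

# mspark/rdd.py -- free functions that Spark ships to the workers:
#   rddparse_data_CV, rddparse_data_ILS, rddparse_all_ILS,
#   rddparse_dataset_ILS, rddinfo_ILS, rddembed_ILS, rddembed_ILS_EXT,
#   rddfeat_ILS, rddanalysis_ILS, format_out
# mspark/SC.py  -- class Sparker (driver side), plus `from .rdd import *`
from ..mspark import rdd, SC  # callers may import either module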
mdata/ILSVRC.py

@@ -1,8 +1,8 @@
 __author__ = 'chunk'
 
 from . import *
-from ..mfeat import HOG, IntraBlockDiff
-from ..mspark import SC
+from ..mfeat import IntraBlockDiff
+from ..mspark import rdd, SC
 from ..common import *
 
 import os, sys
@@ -83,11 +83,11 @@ class DataILSVRC(DataDumperBase):
         pass
 
     def get_feat(self, image, feattype='ibd', **kwargs):
-        size = kwargs.get('size', (48, 48))
-
-        if feattype == 'hog':
-            feater = HOG.FeatHOG(size=size)
-        elif feattype == 'ibd':
+        # size = kwargs.get('size', (48, 48))
+        #
+        # if feattype == 'hog':
+        #     feater = HOG.FeatHOG(size=size)
+        if feattype == 'ibd':
             feater = IntraBlockDiff.FeatIntraBlockDiff()
         else:
             raise Exception("Unknown feature type!")
@@ -99,9 +99,9 @@ class DataILSVRC(DataDumperBase):
 
     def extract_feat(self, feattype='ibd'):
         print "extracting feat..."
-        if feattype == 'hog':
-            feater = HOG.FeatHOG(size=(48, 48))
-        elif feattype == 'ibd':
+        # if feattype == 'hog':
+        #     feater = HOG.FeatHOG(size=(48, 48))
+        if feattype == 'ibd':
             feater = IntraBlockDiff.FeatIntraBlockDiff()
         else:
             raise Exception("Unknown feature type!")
@@ -307,7 +307,7 @@ class DataILSVRC(DataDumperBase):
         #         cv2.imwrite(os.path.join(base_dir, category + '_crop_cv', name), img_crop)
         #     except Exception as e:
         #         print '[EXCPT]', e
-        #     pass
+        # pass
 
 
     def get_table(self):
@@ -322,10 +322,10 @@ class DataILSVRC(DataDumperBase):
         tables = self.connection.tables()
         if self.table_name not in tables:
             families_compressed = {'cf_pic': dict(compression='LZO'),
-                                   'cf_info': dict(max_versions=10,compression='LZO'),
-                                   'cf_tag': dict(compression='LZO'),
-                                   'cf_feat': dict(compression='LZO'),
-                                   }
+                                   'cf_info': dict(max_versions=10, compression='LZO'),
+                                   'cf_tag': dict(compression='LZO'),
+                                   'cf_feat': dict(compression='LZO'),
+                                   }
             families = {'cf_pic': dict(),
                         'cf_info': dict(max_versions=10),
                         'cf_tag': dict(),
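Note: with the HOG branches commented out above, 'ibd' is the only feature type the dumper still accepts; 'hog' now falls through to the raise. A minimal sketch of the changed contract (the dumper instance and image path are placeholders, not from this commit):

desc = dumper.get_feat(image, feattype='ibd')  # still works: IntraBlockDiff
dumper.get_feat(image, feattype='hog')         # now raises Exception("Unknown feature type!")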
mdata/ILSVRC_S.py

@@ -1,8 +1,8 @@
 __author__ = 'chunk'
 
 from . import *
-from ..mfeat import HOG, IntraBlockDiff
-from ..mspark import SC
+from ..mfeat import IntraBlockDiff
+from ..mspark import rdd, SC
 from pyspark.mllib.regression import LabeledPoint
 from ..common import *
 
@@ -135,11 +135,11 @@ class DataILSVRC_S(DataDumperBase):
         tmpf.close()
 
     def _get_feat(self, image, feattype='ibd', **kwargs):
-        size = kwargs.get('size', (48, 48))
-
-        if feattype == 'hog':
-            feater = HOG.FeatHOG(size=size)
-        elif feattype == 'ibd':
+        # size = kwargs.get('size', (48, 48))
+        #
+        # if feattype == 'hog':
+        #     feater = HOG.FeatHOG(size=size)
+        if feattype == 'ibd':
             feater = IntraBlockDiff.FeatIntraBlockDiff()
         else:
             raise Exception("Unknown feature type!")
@@ -267,16 +267,16 @@ class DataILSVRC_S(DataDumperBase):
         ]
 
         # # Debug
-        # tmp_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_data_ILS,
+        # tmp_data = self.sparker.read_hbase(self.table_name, func=rdd.rddparse_data_ILS,
         #                                    collect=False)
-        # # tmp_data = tmp_data.mapValues(lambda data: [data] + SC.rddinfo_ILS(data))
+        # # tmp_data = tmp_data.mapValues(lambda data: [data] + rdd.rddinfo_ILS(data))
         # print tmp_data.collect()[0][1]
         # return
 
 
-        self.rdd_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_data_ILS,
+        self.rdd_data = self.sparker.read_hbase(self.table_name, func=rdd.rddparse_data_ILS,
                                                 collect=False).mapValues(
-            lambda data: [data] + SC.rddinfo_ILS(data))
+            lambda data: [data] + rdd.rddinfo_ILS(data))
 
         if not writeback:
             return self.rdd_data
@@ -293,14 +293,14 @@ class DataILSVRC_S(DataDumperBase):
         ]
 
         # # Debug
-        # tmp_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_data_ILS,
+        # tmp_data = self.sparker.read_hbase(self.table_name, func=rdd.rddparse_data_ILS,
         #                                    collect=False)
-        # # tmp_data = tmp_data.mapValues(lambda data: [data] + SC.rddinfo_ILS(data))
+        # # tmp_data = tmp_data.mapValues(lambda data: [data] + rdd.rddinfo_ILS(data))
         # print tmp_data.collect()[0][1]
         # return
 
 
-        self.rdd_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_data_ILS,
+        self.rdd_data = self.sparker.read_hbase(self.table_name, func=rdd.rddparse_data_ILS,
                                                 collect=False).mapValues(
             lambda data: [data])
 
@@ -417,12 +417,12 @@ class DataILSVRC_S(DataDumperBase):
         ]
 
         if readforward:
-            self.rdd_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_all_ILS, collect=False)
+            self.rdd_data = self.sparker.read_hbase(self.table_name, func=rdd.rddparse_all_ILS, collect=False)
 
-        # rdd_data_ext = self.rdd_data.map(lambda x: SC.rddembed_ILS(x, rate=rate)).filter(lambda x: x != None)
+        # rdd_data_ext = self.rdd_data.map(lambda x: rdd.rddembed_ILS(x, rate=rate)).filter(lambda x: x != None)
         # self.rdd_data = self.rdd_data.union(rdd_data_ext)
 
-        self.rdd_data = self.rdd_data.flatMap(lambda x: SC.rddembed_ILS_EXT(x, rate=rate))
+        self.rdd_data = self.rdd_data.flatMap(lambda x: rdd.rddembed_ILS_EXT(x, rate=rate))
         if not writeback:
             return self.rdd_data
         else:
@@ -513,9 +513,9 @@ class DataILSVRC_S(DataDumperBase):
         ]
 
         if readforward:
-            self.rdd_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_all_ILS, collect=False)
+            self.rdd_data = self.sparker.read_hbase(self.table_name, func=rdd.rddparse_all_ILS, collect=False)
 
-        self.rdd_data = self.rdd_data.mapValues(lambda items: SC.rddfeat_ILS(items, feattype))
+        self.rdd_data = self.rdd_data.mapValues(lambda items: rdd.rddfeat_ILS(items, feattype))
 
         # print self.rdd_data.collect()[0]
         # return
@@ -541,9 +541,9 @@ class DataILSVRC_S(DataDumperBase):
         ]
 
        if readforward:
-            self.rdd_data = self.sparker.read_hbase(self.table_name, func=SC.rddparse_all_ILS, collect=False)
+            self.rdd_data = self.sparker.read_hbase(self.table_name, func=rdd.rddparse_all_ILS, collect=False)
 
-        self.rdd_data = self.rdd_data.mapValues(lambda items: SC.rddanalysis_ILS(items))
+        self.rdd_data = self.rdd_data.mapValues(lambda items: rdd.rddanalysis_ILS(items))
 
         # print self.rdd_data.collect()[0]
         # return
@@ -621,7 +621,7 @@ class DataILSVRC_S(DataDumperBase):
         self.sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S',
                                   master='spark://HPC-server:7077')
 
-        rdd_dataset = self.sparker.read_hbase(self.table_name, func=SC.rddparse_dataset_ILS, collect=False)
+        rdd_dataset = self.sparker.read_hbase(self.table_name, func=rdd.rddparse_dataset_ILS, collect=False)
         if not collect:
             rdd_dataset = rdd_dataset.map(lambda x: LabeledPoint(x[0], x[1]))
         return rdd_dataset
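Note: every change in this file is the same mechanical rename, SC.rddXXX to rdd.rddXXX, with the control flow untouched. The resulting read-parse-enrich chain, pulled out of the diff into one place (a sketch: the table name is hypothetical and a reachable Spark master and HBase quorum are assumed):

from ..mspark import rdd, SC

sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S',
                     master='spark://HPC-server:7077')

# (key, raw-jpeg-bytes) pairs out of HBase, then [bytes] + info vector per key
rdd_data = sparker.read_hbase('ILSVRC_sample_tbl', func=rdd.rddparse_data_ILS,
                              collect=False) \
                  .mapValues(lambda data: [data] + rdd.rddinfo_ILS(data))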
mmodel/svm/SVM.py

@@ -9,7 +9,7 @@ import os, sys
 from ...mfeat import *
 from ...mmodel import *
 from ...mmodel.svm.svmutil import *
-from ...mspark import SC2
+from ...mspark import SC
 from ...common import *
 
 import numpy as np
@@ -191,7 +191,7 @@ class ModelSVM(ModelBase):
 
     def _train_spark(self, X, Y=None):
         if self.sparker == None:
-            self.sparker = SC2.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077')
+            self.sparker = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077')
 
         self.model = self.sparker.train_svm(X, Y)
 
mspark/SC.py

@@ -4,349 +4,20 @@ __author__ = 'chunk'
 from ..common import *
 from .dependencies import *
 from . import *
-# from ..mdata import MSR, CV, ILSVRC, ILSVRC_S
-
-from ..mjpeg import *
-from ..msteg import *
-from ..msteg.steganography import LSB, F3, F4, F5
-from ..mfeat import IntraBlockDiff
-from ..mmodel.svm import SVM2
+from .rdd import *
 
 import sys
 from pyspark import RDD
 from pyspark import SparkConf, SparkContext
 from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD
 from pyspark.mllib.regression import LabeledPoint
-from numpy import array
-import json
-import pickle
-import tempfile
+
 
 import numpy as np
-from scipy import stats
-from hashlib import md5
+
 
 np.random.seed(sum(map(ord, "whoami")))
 package_dir = os.path.dirname(os.path.abspath(__file__))
-classifier = SVM2.ModelSVM(toolset='sklearn')
-
-
-def rddparse_data_CV(raw_row):
-    """
-    input: (u'key0',u'cf_feat:hog:[0.056273,...]--%--cf_pic:data:\ufffd\ufffd\...--%--cf_tag:hog:True')
-    return: ([0.056273,...],1)
-    """
-    data = raw_row[1].split('--%--')
-    feat = json.loads(data[0].split(':')[-1])
-    tag = 1 if data[-1].split(':')[-1] == 'True' else 0
-    return (feat, tag)
-
-
-def rddparse_data_ILS(raw_row):
-    """
-    input: (u'key0',u'cf_feat:hog:[0.056273,...]--%--cf_pic:data:\ufffd\ufffd\...--%--cf_tag:hog:True')
-    return: ([0.056273,...],1)
-
-    In fact we can also use mapValues.
-    """
-    key = raw_row[0]
-    # if key == '04650c488a2b163ca8a1f52da6022f03.jpg':
-    #     with open('/tmp/hhhh','wb') as f:
-    #         f.write(raw_row[1].decode('unicode-escape')).encode('latin-1')
-    items = raw_row[1].decode('unicode-escape').encode('latin-1').split('--%--')
-    data = items[0].split('cf_pic:data:')[-1]
-    return (key, data)
-
-
-def rddparse_all_ILS(raw_row):
-    """
-    Deprecated
-    """
-    key = raw_row[0]
-    items = raw_row[1].decode('unicode-escape').encode('latin-1').split('--%--')
-
-    # @TODO
-    # N.B "ValueError: No JSON object could be decoded" Because the spark-hbase IO is based on strings.
-    # And the order of items is not as expected. See ../res/row-sample.txt or check in hbase shell for that.
-
-    data = [items[0].split('cf_pic:data:')[-1]] + [json.loads(item.split(':')[-1]) for item in
-                                                   items[1:]]
-
-    return (key, data)
-
-
-def rddparse_dataset_ILS(raw_row):
-    if raw_row[0] == '04650c488a2b163ca8a1f52da6022f03.jpg':
-        print raw_row
-    items = raw_row[1].decode('unicode-escape').encode('latin-1').split('--%--')
-    # tag = int(items[-2].split('cf_tag:' + tagtype)[-1])
-    # feat = [item for sublist in json.loads(items[-1].split('cf_feat:' + feattype)[-1]) for subsublist in sublist for item in subsublist]
-    tag = int(items[-1].split(':')[-1])
-    feat = [item for sublist in json.loads(items[0].split(':')[-1]) for subsublist in sublist for
-            item in subsublist]
-
-    return (tag, feat)
-
-
-def rddinfo_ILS(img, info_rate=None, tag_chosen=None, tag_class=None):
-    """
-    Tempfile is our friend. (?)
-    """
-    info_rate = info_rate if info_rate != None else 0.0
-    tag_chosen = tag_chosen if tag_chosen != None else stats.bernoulli.rvs(0.8)
-    tag_class = tag_class if tag_class != None else 0
-    try:
-        tmpf = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b', delete=True)
-        tmpf.write(img)
-        tmpf.seek(0)
-        im = Jpeg(tmpf.name, key=sample_key)
-        info = [
-            im.image_width,
-            im.image_height,
-            im.image_width * im.image_height,
-            im.getCapacity(),
-            im.getQuality(),
-            info_rate,
-            tag_chosen,
-            tag_class
-        ]
-        return info
-    except Exception as e:
-        print e
-        raise
-    finally:
-        tmpf.close()
-
-
-def rddembed_ILS(row, rate=None):
-    """
-    input:
-        e.g. row =('row1',[1,3400,'hello'])
-    return:
-        newrow = ('row2',[34,5400,'embeded'])
-    """
-    items = row[1]
-    capacity, chosen = int(items[4]), int(items[7])
-    if chosen == 0:
-        return None
-    try:
-        tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
-        tmpf_src.write(items[0])
-        tmpf_src.seek(0)
-        tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
-
-        steger = F5.F5(sample_key, 1)
-
-        if rate == None:
-            embed_rate = steger.embed_raw_data(tmpf_src.name,
-                                               os.path.join(package_dir, '../res/toembed'),
-                                               tmpf_dst.name)
-        else:
-            assert (rate >= 0 and rate < 1)
-            # print capacity
-            hidden = np.random.bytes(int(int(capacity) * rate) / 8)
-            embed_rate = steger.embed_raw_data(tmpf_src.name, hidden, tmpf_dst.name, frommem=True)
-
-        tmpf_dst.seek(0)
-        raw = tmpf_dst.read()
-        index = md5(raw).hexdigest()
-
-        return (index + '.jpg', [raw] + rddinfo_ILS(raw, embed_rate, 0, 1))
-
-    except Exception as e:
-        print e
-        raise
-    finally:
-        tmpf_src.close()
-        tmpf_dst.close()
-
-
-def rddembed_ILS_EXT(row, rate=None):
-    """
-    input:
-        e.g. row =('row1',[1,3400,'hello'])
-    return:
-        newrow = ('row2',[34,5400,'embeded']) or NULL
-        [row,newrow]
-    """
-    items = row[1]
-    capacity, chosen = int(items[4]), int(items[7])
-    if chosen == 0:
-        return [row]
-    try:
-        tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
-        tmpf_src.write(items[0])
-        tmpf_src.seek(0)
-        tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
-
-        steger = F5.F5(sample_key, 2)
-
-        if rate == None:
-            embed_rate = steger.embed_raw_data(tmpf_src.name,
-                                               os.path.join(package_dir, '../res/toembed'),
-                                               tmpf_dst.name)
-        else:
-            assert (rate >= 0 and rate < 1)
-            # print capacity
-            hidden = np.random.bytes(int(int(capacity) * rate) / 8)
-            embed_rate = steger.embed_raw_data(tmpf_src.name, hidden, tmpf_dst.name, frommem=True)
-
-        tmpf_dst.seek(0)
-        raw = tmpf_dst.read()
-        index = md5(raw).hexdigest()
-
-        return [row, (index + '.jpg', [raw] + rddinfo_ILS(raw, embed_rate, 0, 1))]
-
-    except Exception as e:
-        print e
-        raise
-    finally:
-        tmpf_src.close()
-        tmpf_dst.close()
-
-
-def _get_feat(image, feattype='ibd', **kwargs):
-    if feattype == 'ibd':
-        feater = IntraBlockDiff.FeatIntraBlockDiff()
-    else:
-        raise Exception("Unknown feature type!")
-
-    desc = feater.feat(image)
-
-    return desc
-
-
-def rddfeat_ILS(items, feattype='ibd', **kwargs):
-    try:
-        tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
-        tmpf_src.write(items[0])
-        tmpf_src.seek(0)
-
-        desc = json.dumps(_get_feat(tmpf_src.name, feattype=feattype).tolist())
-        # print 'desccccccccccccccccccc',desc
-        return items + [desc]
-
-    except Exception as e:
-        print e
-        raise
-    finally:
-        tmpf_src.close()
-
-
-def rddanalysis_ILS(items, feattype='ibd', **kwargs):
-    head = np.fromstring(items[0][:2], dtype=np.uint8)
-    if not np.array_equal(head, [255, 216]):
-        return items + [0]
-    try:
-        tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
-        tmpf_src.write(items[0])
-        tmpf_src.seek(0)
-
-        desc = _get_feat(tmpf_src.name, feattype=feattype)
-        tag = classifier.predict(desc.ravel())[0]
-        # print 'desccccccccccccccccccc',desc
-        return items + [tag]
-
-    except Exception as e:
-        print e
-        raise
-    finally:
-        tmpf_src.close()
-
-    # return items + classifier.predict(items[-1])
-
-
-def format_out(row, cols, withdata=False):
-    """
-    input:
-        e.g. row =('row1',[1,3400,'hello'])
-             cols = [['cf_info', 'id'], ['cf_info', 'size'], ['cf_tag', 'desc']]
-    return:
-        [('row1',['row1', 'cf_info', 'id', '1']),('row1',['row1', 'cf_info', 'size', '3400']),('row1',['row1', 'cf_tag', 'desc', 'hello'])]
-    """
-    puts = []
-    key = row[0]
-    # if key == '04650c488a2b163ca8a1f52da6022f03.jpg':
-    #     print row
-    if not withdata:
-        for data, col in zip(row[1][1:], cols[1:]):
-            puts.append((key, [key] + col + [str(data)]))
-    else:
-        for data, col in zip(row[1], cols):
-            puts.append((key, [key] + col + [str(data)]))
-    return puts
-
-
-# scconf = SparkConf()
-# scconf.setSparkHome("HPC-server") \
-#     .setMaster("spark://HPC-server:7077") \
-#     .setAppName("example")
-# sc = SparkContext(conf=scconf)
-#
-#
-# def read_hbase(table_name, func=None, collect=False):
-#     """
-#     ref - http://happybase.readthedocs.org/en/latest/user.html#retrieving-data
-#
-#     Filter format:
-#         columns=['cf1:col1', 'cf1:col2']
-#         or
-#         columns=['cf1']
-#
-#     """
-#
-#     hconf = {
-#         "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2",
-#         # "hbase.zookeeper.quorum": self.host,
-#         "hbase.mapreduce.inputtable": table_name,
-#     }
-#
-#     hbase_rdd = sc.newAPIHadoopRDD(inputFormatClass=hparams["inputFormatClass"],
-#                                    keyClass=hparams["readKeyClass"],
-#                                    valueClass=hparams["readValueClass"],
-#                                    keyConverter=hparams["readKeyConverter"],
-#                                    valueConverter=hparams["readValueConverter"],
-#                                    conf=hconf)
-#
-#     parser = func if func != None else rddparse_data_CV
-#     hbase_rdd = hbase_rdd.map(lambda x: parser(x))
-#
-#     if collect:
-#         return hbase_rdd.collect()
-#     else:
-#         return hbase_rdd
-#
-#
-# def write_hbase(table_name, data, fromrdd=False, columns=None, withdata=False):
-#     """
-#     Data Format: (Deprecated)
-#         e.g. [["row8", "f1", "", "caocao cao"], ["row9", "f1", "c1", "asdfg hhhh"]]
-#
-#     Data(from dictionary):
-#         e.g. data ={'row1':[1,3400,'hello'], 'row2':[34,5000,'here in mine']},
-#              cols = ['cf_info:id', 'cf_info:size', 'cf_tag:desc']
-#     Data(from Rdd):
-#         e.g. data =[('row1',[1,3400,'hello']), ('row2',[34,5000,'here in mine'])],
-#              cols = ['cf_info:id', 'cf_info:size', 'cf_tag:desc']
-#     """
-#     hconf = {
-#         "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2",  # "hbase.zookeeper.quorum": self.host,
-#         "hbase.mapreduce.inputtable": table_name,
-#         "hbase.mapred.outputtable": table_name,
-#         "mapreduce.outputformat.class": hparams["outputFormatClass"],
-#         "mapreduce.job.output.key.class": hparams["writeKeyClass"],
-#         "mapreduce.job.output.value.class": hparams["writeValueClass"],
-#     }
-#     cols = [col.split(':') for col in columns]
-#     if not fromrdd:
-#         rdd_data = sc.parallelize(data)
-#     else:
-#         rdd_data = data
-#
-#     rdd_data.flatMap(lambda x: format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset(
-#         conf=hconf,
-#         keyConverter=hparams["writeKeyConverter"],
-#         valueConverter=hparams["writeValueConverter"])
 
 
 class Sparker(object):
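Note: since SC.py now ends its import block with `from .rdd import *`, every public helper stays reachable under the old SC namespace; only underscore-prefixed names such as _get_feat stop being re-exported (standard star-import semantics, assuming rdd.py defines no __all__). So call sites that were not migrated in this commit keep working:

from ..mspark import SC, rdd

# one function object, two spellings -- old SC.rddparse_data_ILS call
# sites resolve to the same helper as the new rdd.rddparse_data_ILS
assert SC.rddparse_data_ILS is rdd.rddparse_data_ILS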
mspark/rdd.py (new file)

@@ -0,0 +1,267 @@
+__author__ = 'hadoop'
+
+from ..common import *
+
+from ..mjpeg import *
+from ..msteg import *
+from ..msteg.steganography import LSB, F3, F4, F5
+from ..mfeat import IntraBlockDiff
+from ..mmodel.svm import SVM
+
+from numpy import array
+import json
+import pickle
+import tempfile
+
+import numpy as np
+from scipy import stats
+from hashlib import md5
+
+np.random.seed(sum(map(ord, "whoami")))
+package_dir = os.path.dirname(os.path.abspath(__file__))
+classifier = SVM.ModelSVM(toolset='sklearn')
+
+def rddparse_data_CV(raw_row):
+    """
+    input: (u'key0',u'cf_feat:hog:[0.056273,...]--%--cf_pic:data:\ufffd\ufffd\...--%--cf_tag:hog:True')
+    return: ([0.056273,...],1)
+    """
+    data = raw_row[1].split('--%--')
+    feat = json.loads(data[0].split(':')[-1])
+    tag = 1 if data[-1].split(':')[-1] == 'True' else 0
+    return (feat, tag)
+
+
+def rddparse_data_ILS(raw_row):
+    """
+    input: (u'key0',u'cf_feat:hog:[0.056273,...]--%--cf_pic:data:\ufffd\ufffd\...--%--cf_tag:hog:True')
+    return: ([0.056273,...],1)
+
+    In fact we can also use mapValues.
+    """
+    key = raw_row[0]
+    # if key == '04650c488a2b163ca8a1f52da6022f03.jpg':
+    #     with open('/tmp/hhhh','wb') as f:
+    #         f.write(raw_row[1].decode('unicode-escape')).encode('latin-1')
+    items = raw_row[1].decode('unicode-escape').encode('latin-1').split('--%--')
+    data = items[0].split('cf_pic:data:')[-1]
+    return (key, data)
+
+
+def rddparse_all_ILS(raw_row):
+    """
+    Deprecated
+    """
+    key = raw_row[0]
+    items = raw_row[1].decode('unicode-escape').encode('latin-1').split('--%--')
+
+    # @TODO
+    # N.B "ValueError: No JSON object could be decoded" Because the spark-hbase IO is based on strings.
+    # And the order of items is not as expected. See ../res/row-sample.txt or check in hbase shell for that.
+
+    data = [items[0].split('cf_pic:data:')[-1]] + [json.loads(item.split(':')[-1]) for item in
+                                                   items[1:]]
+
+    return (key, data)
+
+
+def rddparse_dataset_ILS(raw_row):
+    if raw_row[0] == '04650c488a2b163ca8a1f52da6022f03.jpg':
+        print raw_row
+    items = raw_row[1].decode('unicode-escape').encode('latin-1').split('--%--')
+    # tag = int(items[-2].split('cf_tag:' + tagtype)[-1])
+    # feat = [item for sublist in json.loads(items[-1].split('cf_feat:' + feattype)[-1]) for subsublist in sublist for item in subsublist]
+    tag = int(items[-1].split(':')[-1])
+    feat = [item for sublist in json.loads(items[0].split(':')[-1]) for subsublist in sublist for
+            item in subsublist]
+
+    return (tag, feat)
+
+
+def rddinfo_ILS(img, info_rate=None, tag_chosen=None, tag_class=None):
+    """
+    Tempfile is our friend. (?)
+    """
+    info_rate = info_rate if info_rate != None else 0.0
+    tag_chosen = tag_chosen if tag_chosen != None else stats.bernoulli.rvs(0.8)
+    tag_class = tag_class if tag_class != None else 0
+    try:
+        tmpf = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b', delete=True)
+        tmpf.write(img)
+        tmpf.seek(0)
+        im = Jpeg(tmpf.name, key=sample_key)
+        info = [
+            im.image_width,
+            im.image_height,
+            im.image_width * im.image_height,
+            im.getCapacity(),
+            im.getQuality(),
+            info_rate,
+            tag_chosen,
+            tag_class
+        ]
+        return info
+    except Exception as e:
+        print e
+        raise
+    finally:
+        tmpf.close()
+
+
+def rddembed_ILS(row, rate=None):
+    """
+    input:
+        e.g. row =('row1',[1,3400,'hello'])
+    return:
+        newrow = ('row2',[34,5400,'embeded'])
+    """
+    items = row[1]
+    capacity, chosen = int(items[4]), int(items[7])
+    if chosen == 0:
+        return None
+    try:
+        tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
+        tmpf_src.write(items[0])
+        tmpf_src.seek(0)
+        tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
+
+        steger = F5.F5(sample_key, 1)
+
+        if rate == None:
+            embed_rate = steger.embed_raw_data(tmpf_src.name,
+                                               os.path.join(package_dir, '../res/toembed'),
+                                               tmpf_dst.name)
+        else:
+            assert (rate >= 0 and rate < 1)
+            # print capacity
+            hidden = np.random.bytes(int(int(capacity) * rate) / 8)
+            embed_rate = steger.embed_raw_data(tmpf_src.name, hidden, tmpf_dst.name, frommem=True)
+
+        tmpf_dst.seek(0)
+        raw = tmpf_dst.read()
+        index = md5(raw).hexdigest()
+
+        return (index + '.jpg', [raw] + rddinfo_ILS(raw, embed_rate, 0, 1))
+
+    except Exception as e:
+        print e
+        raise
+    finally:
+        tmpf_src.close()
+        tmpf_dst.close()
+
+
+def rddembed_ILS_EXT(row, rate=None):
+    """
+    input:
+        e.g. row =('row1',[1,3400,'hello'])
+    return:
+        newrow = ('row2',[34,5400,'embeded']) or NULL
+        [row,newrow]
+    """
+    items = row[1]
+    capacity, chosen = int(items[4]), int(items[7])
+    if chosen == 0:
+        return [row]
+    try:
+        tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
+        tmpf_src.write(items[0])
+        tmpf_src.seek(0)
+        tmpf_dst = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
+
+        steger = F5.F5(sample_key, 2)
+
+        if rate == None:
+            embed_rate = steger.embed_raw_data(tmpf_src.name,
+                                               os.path.join(package_dir, '../res/toembed'),
+                                               tmpf_dst.name)
+        else:
+            assert (rate >= 0 and rate < 1)
+            # print capacity
+            hidden = np.random.bytes(int(int(capacity) * rate) / 8)
+            embed_rate = steger.embed_raw_data(tmpf_src.name, hidden, tmpf_dst.name, frommem=True)
+
+        tmpf_dst.seek(0)
+        raw = tmpf_dst.read()
+        index = md5(raw).hexdigest()
+
+        return [row, (index + '.jpg', [raw] + rddinfo_ILS(raw, embed_rate, 0, 1))]
+
+    except Exception as e:
+        print e
+        raise
+    finally:
+        tmpf_src.close()
+        tmpf_dst.close()
+
+
+def _get_feat(image, feattype='ibd', **kwargs):
+    if feattype == 'ibd':
+        feater = IntraBlockDiff.FeatIntraBlockDiff()
+    else:
+        raise Exception("Unknown feature type!")
+
+    desc = feater.feat(image)
+
+    return desc
+
+
+def rddfeat_ILS(items, feattype='ibd', **kwargs):
+    try:
+        tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
+        tmpf_src.write(items[0])
+        tmpf_src.seek(0)
+
+        desc = json.dumps(_get_feat(tmpf_src.name, feattype=feattype).tolist())
+        # print 'desccccccccccccccccccc',desc
+        return items + [desc]
+
+    except Exception as e:
+        print e
+        raise
+    finally:
+        tmpf_src.close()
+
+
+def rddanalysis_ILS(items, feattype='ibd', **kwargs):
+    head = np.fromstring(items[0][:2], dtype=np.uint8)
+    if not np.array_equal(head, [255, 216]):
+        return items + [0]
+    try:
+        tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
+        tmpf_src.write(items[0])
+        tmpf_src.seek(0)
+
+        desc = _get_feat(tmpf_src.name, feattype=feattype)
+        tag = classifier.predict(desc.ravel())[0]
+        # print 'desccccccccccccccccccc',desc
+        return items + [tag]
+
+    except Exception as e:
+        print e
+        raise
+    finally:
+        tmpf_src.close()
+
+    # return items + classifier.predict(items[-1])
+
+
+def format_out(row, cols, withdata=False):
+    """
+    input:
+        e.g. row =('row1',[1,3400,'hello'])
+             cols = [['cf_info', 'id'], ['cf_info', 'size'], ['cf_tag', 'desc']]
+    return:
+        [('row1',['row1', 'cf_info', 'id', '1']),('row1',['row1', 'cf_info', 'size', '3400']),('row1',['row1', 'cf_tag', 'desc', 'hello'])]
+    """
+    puts = []
+    key = row[0]
+    # if key == '04650c488a2b163ca8a1f52da6022f03.jpg':
+    #     print row
+    if not withdata:
+        for data, col in zip(row[1][1:], cols[1:]):
+            puts.append((key, [key] + col + [str(data)]))
+    else:
+        for data, col in zip(row[1], cols):
+            puts.append((key, [key] + col + [str(data)]))
+    return puts
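Note: the moved block is essentially the old SC.py code verbatim; the visible differences are the new module header and the classifier now being built from SVM rather than SVM2 (matching the rename in mmodel/svm/SVM.py above). As a quick smoke test, format_out reproduces the example in its own docstring:

row = ('row1', [1, 3400, 'hello'])
cols = [['cf_info', 'id'], ['cf_info', 'size'], ['cf_tag', 'desc']]

# withdata=True emits one HBase put per column, row data included
for put in format_out(row, cols, withdata=True):
    print put
# ('row1', ['row1', 'cf_info', 'id', '1'])
# ('row1', ['row1', 'cf_info', 'size', '3400'])
# ('row1', ['row1', 'cf_tag', 'desc', 'hello'])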