Commit ea1eb31a0f395ca7810bb29b79184052b23dbdf8

Authored by Chunk
1 parent ad70caf6
Exists in master and in 1 other branch: refactor

Spark is privileged... we are going to write a special data module to process Spark & HBase data.
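For orientation, a minimal sketch of how the refactored reader is meant to be called (host, app name, and the derived table name are taken from the diffs below; it assumes a reachable Spark master and HBase/ZooKeeper quorum and is only illustrative):

    sparker = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077')

    # Eager path used by the data dumpers: parse each HBase row with the
    # default parse_cv parser and collect the results to the driver.
    result = sparker.read_hbase('ILSVRC2013_DET_val-Train', collect=True)

    # Lazy path: keep the parsed RDD on the cluster for further Spark-side
    # work; a custom row parser can be supplied via func.
    rdd = sparker.read_hbase('ILSVRC2013_DET_val-Train', collect=False)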

@@ -89,7 +89,7 @@ class DataCV(DataDumperBase):
                         'cf_info': dict(max_versions=10),
                         'cf_tag': dict(),
                         'cf_feat': dict(),
-            }
+                         }
             self.connection.create_table(name=self.table_name, families=families)
 
         table = self.connection.table(name=self.table_name)
@@ -250,7 +250,7 @@ class DataCV(DataDumperBase):
             if self.sparkcontex == None:
                 self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077')
 
-            result = self.sparkcontex.read_habase(self.table_name)  # result = {key:[feat,tag],...}
+            result = self.sparkcontex.read_hbase(self.table_name, collect=True)  # result = {key:[feat,tag],...}
             for feat, tag in result:
                 X.append(feat)
                 Y.append(tag)
mdata/ILSVRC-S.py 0 → 100644
@@ -0,0 +1,442 @@
  1 +__author__ = 'chunk'
  2 +
  3 +from . import *
  4 +from ..mfeat import HOG, IntraBlockDiff
  5 +from ..mspark import SC
  6 +from ..common import *
  7 +
  8 +import os, sys
  9 +from PIL import Image
  10 +from hashlib import md5
  11 +import csv
  12 +import shutil
  13 +import json
  14 +import collections
  15 +import happybase
  16 +
  17 +from ..mjpeg import *
  18 +from ..msteg import *
  19 +from ..msteg.steganography import LSB, F3, F4, F5
  20 +
  21 +import numpy as np
  22 +from numpy.random import randn
  23 +import pandas as pd
  24 +from scipy import stats
  25 +
  26 +from subprocess import Popen, PIPE, STDOUT
  27 +
  28 +
  29 +np.random.seed(sum(map(ord, "whoami")))
  30 +
  31 +package_dir = os.path.dirname(os.path.abspath(__file__))
  32 +
  33 +
  34 +class DataILSVRCS(DataDumperBase):
  35 + def __init__(self, base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train'):
  36 + DataDumperBase.__init__(self, base_dir, category)
  37 +
  38 + self.base_dir = base_dir
  39 + self.category = category
  40 + self.data_dir = os.path.join(self.base_dir, self.category)
  41 +
  42 + self.dst_dir = os.path.join(self.base_dir, 'dst', self.category)
  43 + self.list_file = os.path.join(self.dst_dir, 'file-tag.tsv')
  44 + self.feat_dir = os.path.join(self.dst_dir, 'Feat')
  45 + self.img_dir = os.path.join(self.dst_dir, 'Img')
  46 +
  47 + self.dict_data = {}
  48 +
  49 + self.table_name = self.base_dir.strip('/').split('/')[-1] + '-' + self.category
  50 + self.sparkcontex = None
  51 +
  52 + def format(self):
  53 + self.extract()
  54 +
  55 + def _hash_copy(self, image):
  56 + if not image.endswith('jpg'):
  57 + img = Image.open(image)
  58 + img.save('../res/tmp.jpg', format='JPEG')
  59 + image = '../res/tmp.jpg'
  60 +
  61 + with open(image, 'rb') as f:
  62 + index = md5(f.read()).hexdigest()
  63 +
  64 + im = Jpeg(image, key=sample_key)
  65 + self.dict_data[index] = [im.image_width, im.image_height, im.image_width * im.image_height, im.getCapacity(),
  66 + im.getQuality()]
  67 +
  68 + # self.dict_data[index] = [im.image_width, im.image_height, os.path.getsize(image), im.getQuality()]
  69 +
  70 + # origin:
  71 + # dir = base_dir + 'Img/Train/' + index[:3]
  72 + dir = os.path.join(self.img_dir, index[:3])
  73 + if not os.path.exists(dir):
  74 + os.makedirs(dir)
  75 + image_path = os.path.join(dir, index[3:] + '.jpg')
  76 + # print image_path
  77 +
  78 + if not os.path.exists(image_path):
  79 + shutil.copy(image, image_path)
  80 + else:
  81 + pass
  82 +
  83 + def get_feat(self, image, feattype='ibd', **kwargs):
  84 + size = kwargs.get('size', (48, 48))
  85 +
  86 + if feattype == 'hog':
  87 + feater = HOG.FeatHOG(size=size)
  88 + elif feattype == 'ibd':
  89 + feater = IntraBlockDiff.FeatIntraBlockDiff()
  90 + else:
  91 + raise Exception("Unknown feature type!")
  92 +
  93 + desc = feater.feat(image)
  94 +
  95 + return desc
  96 +
  97 +
  98 + def extract_feat(self, feattype='ibd'):
  99 + if feattype == 'hog':
  100 + feater = HOG.FeatHOG(size=(48, 48))
  101 + elif feattype == 'ibd':
  102 + feater = IntraBlockDiff.FeatIntraBlockDiff()
  103 + else:
  104 + raise Exception("Unknown feature type!")
  105 +
  106 + list_image = []
  107 + with open(self.list_file, 'rb') as tsvfile:
  108 + tsvfile = csv.reader(tsvfile, delimiter='\t')
  109 + for line in tsvfile:
  110 + list_image.append(line[0])
  111 +
  112 + dict_featbuf = {}
  113 + for imgname in list_image:
  114 + # if imgtag == 'True':
  115 + image = os.path.join(self.img_dir, imgname[:3], imgname[3:] + '.jpg')
  116 + desc = feater.feat(image)
  117 + dict_featbuf[imgname] = desc
  118 +
  119 + for imgname, desc in dict_featbuf.items():
  120 + # print imgname, desc
  121 + dir = os.path.join(self.feat_dir, imgname[:3])
  122 + if not os.path.exists(dir):
  123 + os.makedirs(dir)
  124 + featpath = os.path.join(dir, imgname[3:].split('.')[0] + '.' + feattype)
  125 + with open(featpath, 'wb') as featfile:
  126 + featfile.write(json.dumps(desc.tolist()))
  127 +
  128 + def _build_list(self, list_file=None):
  129 + if list_file == None:
  130 + list_file = self.list_file
  131 + assert list_file != None
  132 +
  133 + ordict_img = collections.OrderedDict(sorted(self.dict_data.items(), key=lambda d: d[0]))
  134 +
  135 + with open(list_file, 'w') as f:
  136 + tsvfile = csv.writer(f, delimiter='\t')
  137 + for key, value in ordict_img.items():
  138 + tsvfile.writerow([key] + value)
  139 +
  140 + def _anaylis(self, list_file=None):
  141 + if list_file == None:
  142 + list_file = self.list_file
  143 + assert list_file != None
  144 +
  145 + df_ILS = pd.read_csv(list_file, names=['hash', 'width', 'height', 'size', 'capacity', 'quality'], sep='\t')
  146 + length = df_ILS.shape[0]
  147 + df_ILS = df_ILS.sort(['capacity', 'size', 'quality'], ascending=True)
  148 + rand_class = stats.bernoulli.rvs(0.8, size=length)
  149 +
  150 + df_ILS['rate'] = np.zeros(df_ILS.shape[0], np.float64)
  151 + df_ILS['chosen'] = rand_class
  152 + df_ILS['class'] = np.zeros(length, np.int32)
  153 +
  154 + df_ILS.to_csv(list_file, header=False, index=False, sep='\t')
  155 +
  156 + def extract(self):
  157 + for path, subdirs, files in os.walk(self.data_dir):
  158 + for name in files:
  159 + imagepath = os.path.join(path, name)
  160 + # print imagepath
  161 + try:
  162 + self._hash_copy(imagepath)
  163 + except:
  164 + pass
  165 +
  166 + self._build_list()
  167 + self._anaylis()
  168 +
  169 +
  170 + def _embed_outer(self):
  171 + self.dict_data = {}
  172 + dict_embedresult = {}
  173 + os.environ["CLASSPATH"] = os.path.join(package_dir, "../libs/F5/")
  174 + cmd = 'java Embed %s %s -e %s -p password -c "stegan by chunk " -q %d'
  175 +
  176 + df_ILS = pd.read_csv(self.list_file,
  177 + names=['hash', 'width', 'height', 'size', 'capacity', 'quality', 'chosen', 'class'],
  178 + sep='\t')
  179 + df_ILS_TARGET = df_ILS[df_ILS['chosen'] == 1]
  180 +
  181 + for hash, size, quality in zip(df_ILS_TARGET['hash'], df_ILS_TARGET['size'], df_ILS_TARGET['quality']):
  182 + path_img = os.path.join(self.img_dir, hash[:3], hash[3:] + '.jpg')
  183 + if path_img:
  184 + print path_img
  185 + p = Popen(cmd % (path_img, 'res/tmp.jpg', 'res/toembed', quality), shell=True, stdout=PIPE,
  186 + stderr=STDOUT)
  187 + dict_embedresult[hash] = [line.strip('\n') for line in p.stdout.readlines()]
  188 + try:
  189 + self._hash_copy('res/tmp.jpg')
  190 + except:
  191 + pass
  192 +
  193 + with open(self.list_file + '.embed.log', 'wb') as f:
  194 + tsvfile = csv.writer(f, delimiter='\t')
  195 + for key, value in dict_embedresult.items():
  196 + tsvfile.writerow([key] + value)
  197 +
  198 + self._build_list(self.list_file + '.embed')
  199 +
  200 + # merge
  201 + df_ILS_EMBED = pd.read_csv(self.list_file + '.embed', names=['hash', 'width', 'height', 'size', 'quality'],
  202 + sep='\t')
  203 + length = df_ILS_EMBED.shape[0]
  204 + df_ILS_EMBED = df_ILS_EMBED.sort(['size', 'quality'], ascending=True)
  205 + df_ILS_EMBED['chosen'] = np.zeros(length, np.int32)
  206 + df_ILS_EMBED['class'] = np.ones(length, np.int32)
  207 +
  208 + df_ILS = df_ILS.append(df_ILS_EMBED, ignore_index=True)
  209 + df_ILS.to_csv(self.list_file, header=False, index=False, sep='\t')
  210 +
  211 + def _embed_inner(self, rate=None):
  212 + self.dict_data = {}
  213 + f5 = F5.F5(sample_key, 1)
  214 + tmp_img = os.path.join(package_dir, '../res/tmp.jpg')
  215 + df_ILS = pd.read_csv(self.list_file,
  216 + names=['hash', 'width', 'height', 'size', 'capacity', 'quality', 'rate', 'chosen',
  217 + 'class'],
  218 + sep='\t')
  219 + df_ILS_TARGET = df_ILS[df_ILS['chosen'] == 1]
  220 +
  221 + for hash, capacity in zip(df_ILS_TARGET['hash'], df_ILS_TARGET['capacity']):
  222 + path_img = os.path.join(self.img_dir, hash[:3], hash[3:] + '.jpg')
  223 + if path_img:
  224 + print path_img
  225 + if rate == None:
  226 + embed_rate = f5.embed_raw_data(path_img, os.path.join(package_dir, '../res/toembed'), tmp_img)
  227 + else:
  228 + assert (rate >= 0 and rate < 1)
  229 + # print capacity
  230 + hidden = np.random.bytes(int(capacity * rate) / 8)
  231 + embed_rate = f5.embed_raw_data(path_img, hidden, tmp_img, frommem=True)
  232 + try:
  233 + with open(tmp_img, 'rb') as f:
  234 + index = md5(f.read()).hexdigest()
  235 + im = Jpeg(tmp_img, key=sample_key)
  236 + self.dict_data[index] = [im.image_width, im.image_height, im.image_width * im.image_height,
  237 + im.getCapacity(),
  238 + im.getQuality(), embed_rate]
  239 +
  240 + dir = os.path.join(self.img_dir, index[:3])
  241 + if not os.path.exists(dir):
  242 + os.makedirs(dir)
  243 + image_path = os.path.join(dir, index[3:] + '.jpg')
  244 + if not os.path.exists(image_path):
  245 + shutil.copy(tmp_img, image_path)
  246 + else:
  247 + pass
  248 + except:
  249 + pass
  250 +
  251 + self._build_list(self.list_file + '.embed')
  252 +
  253 + # merge
  254 + df_ILS_EMBED = pd.read_csv(self.list_file + '.embed',
  255 + names=['hash', 'width', 'height', 'size', 'capacity', 'quality', 'rate'],
  256 + sep='\t')
  257 +
  258 + df_ILS_EMBED = df_ILS_EMBED.sort(['rate', 'capacity', 'size', 'quality'], ascending=True)
  259 + df_ILS_EMBED['chosen'] = np.zeros(df_ILS_EMBED.shape[0], np.int32)
  260 + df_ILS_EMBED['class'] = np.ones(df_ILS_EMBED.shape[0], np.int32)
  261 +
  262 + # print df_ILS_EMBED.dtypes
  263 + # print df_ILS.dtypes
  264 + # Form the intersection of two Index objects. Sortedness of the result is not guaranteed
  265 + df_ILS = df_ILS.append(df_ILS_EMBED, ignore_index=True)
  266 + df_ILS.to_csv(self.list_file, header=False, index=False, sep='\t')
  267 +
  268 + def embed(self, rate=None):
  269 + self._embed_inner(rate)
  270 +
  271 + def get_table(self):
  272 + if self.table != None:
  273 + return self.table
  274 +
  275 + if self.connection is None:
  276 + c = happybase.Connection('HPC-server')
  277 + self.connection = c
  278 +
  279 + tables = self.connection.tables()
  280 + if self.table_name not in tables:
  281 + families = {'cf_pic': dict(),
  282 + 'cf_info': dict(max_versions=10),
  283 + 'cf_tag': dict(),
  284 + 'cf_feat': dict(),
  285 + }
  286 + self.connection.create_table(name=self.table_name, families=families)
  287 +
  288 + table = self.connection.table(name=self.table_name)
  289 +
  290 + self.table = table
  291 +
  292 + return table
  293 +
  294 +
  295 + def store_img(self):
  296 + if self.table == None:
  297 + self.table = self.get_table()
  298 +
  299 + dict_databuf = {}
  300 +
  301 + with open(self.list_file, 'rb') as tsvfile:
  302 + tsvfile = csv.reader(tsvfile, delimiter='\t')
  303 + for line in tsvfile:
  304 + path_img = os.path.join(self.img_dir, line[0][:3], line[0][3:] + '.jpg')
  305 + if path_img:
  306 + with open(path_img, 'rb') as fpic:
  307 + dict_databuf[line[0] + '.jpg'] = fpic.read()
  308 +
  309 + try:
  310 + with self.table.batch(batch_size=5000) as b:
  311 + for imgname, imgdata in dict_databuf.items():
  312 + b.put(imgname, {'cf_pic:data': imgdata})
  313 + except ValueError:
  314 + raise
  315 +
  316 +
  317 + def store_info(self, infotype='all'):
  318 + if self.table == None:
  319 + self.table = self.get_table()
  320 +
  321 + dict_infobuf = {}
  322 +
  323 + with open(self.list_file, 'rb') as tsvfile:
  324 + tsvfile = csv.reader(tsvfile, delimiter='\t')
  325 + for line in tsvfile:
  326 + dict_infobuf[line[0] + '.jpg'] = line[1:-2]
  327 +
  328 + if infotype == 'all':
  329 + try:
  330 + with self.table.batch(batch_size=5000) as b:
  331 + for imgname, imginfo in dict_infobuf.items():
  332 + b.put(imgname,
  333 + {'cf_info:width': imginfo[0], 'cf_info:height': imginfo[1], 'cf_info:size': imginfo[2],
  334 + 'cf_info:capacity': imginfo[3],
  335 + 'cf_info:quality': imginfo[4]})
  336 + except ValueError:
  337 + raise
  338 + else:
  339 + raise Exception("Unknown infotype!")
  340 +
  341 +
  342 + def store_tag(self, tagtype='all'):
  343 + if self.table == None:
  344 + self.table = self.get_table()
  345 +
  346 + dict_tagbuf = {}
  347 +
  348 + with open(self.list_file, 'rb') as tsvfile:
  349 + tsvfile = csv.reader(tsvfile, delimiter='\t')
  350 + for line in tsvfile:
  351 + dict_tagbuf[line[0] + '.jpg'] = line[-2:]
  352 +
  353 + if tagtype == 'all':
  354 + try:
  355 + with self.table.batch(batch_size=5000) as b:
  356 + for imgname, imgtag in dict_tagbuf.items():
  357 + b.put(imgname, {'cf_tag:chosen': imgtag[0], 'cf_tag:class': imgtag[1]})
  358 + except ValueError:
  359 + raise
  360 + else:
  361 + raise Exception("Unknown tagtype!")
  362 +
  363 +
  364 + def store_feat(self, feattype='ibd'):
  365 + if self.table == None:
  366 + self.table = self.get_table()
  367 +
  368 + dict_featbuf = {}
  369 + for path, subdirs, files in os.walk(self.feat_dir):
  370 + for name in files:
  371 + featpath = os.path.join(path, name)
  372 + # print featpath
  373 + with open(featpath, 'rb') as featfile:
  374 + imgname = path.split('/')[-1] + name.replace('.' + feattype, '.jpg')
  375 + dict_featbuf[imgname] = featfile.read()
  376 +
  377 + try:
  378 + with self.table.batch(batch_size=5000) as b:
  379 + for imgname, featdesc in dict_featbuf.items():
  380 + b.put(imgname, {'cf_feat:' + feattype: featdesc})
  381 + except ValueError:
  382 + raise
  383 + pass
  384 +
  385 +
  386 + def load_data(self, mode='local', feattype='ibd', tagtype='class'):
  387 + INDEX = []
  388 + X = []
  389 + Y = []
  390 +
  391 + if mode == "local":
  392 +
  393 + dict_dataset = {}
  394 +
  395 + with open(self.list_file, 'rb') as tsvfile:
  396 + tsvfile = csv.reader(tsvfile, delimiter='\t')
  397 + for line in tsvfile:
  398 + hash = line[0]
  399 + tag = line[-1]
  400 + path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype)
  401 + if path_feat:
  402 + with open(path_feat, 'rb') as featfile:
  403 + dict_dataset[hash] = (tag, json.loads(featfile.read()))
  404 +
  405 + for tag, feat in dict_dataset.values():
  406 + X.append([item for sublist in feat for subsublist in sublist for item in subsublist])
  407 + Y.append(int(tag))
  408 +
  409 + elif mode == "remote" or mode == "hbase":
  410 + if self.table == None:
  411 + self.table = self.get_table()
  412 +
  413 + col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype
  414 + for key, data in self.table.scan(columns=[col_feat, col_tag]):
  415 + X.append(json.loads(data[col_feat]))
  416 + Y.append(1 if data[col_tag] == 'True' else 0)
  417 +
  418 + elif mode == "spark" or mode == "cluster":
  419 + if self.sparkcontex == None:
  420 + self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077')
  421 +
  422 + result = self.sparkcontex.read_hbase(self.table_name) # result = {key:[feat,tag],...}
  423 + for feat, tag in result:
  424 + X.append(feat)
  425 + Y.append(tag)
  426 +
  427 + else:
  428 + raise Exception("Unknown mode!")
  429 +
  430 + return X, Y
  431 +
  432 +
  433 +
  434 +
  435 +
  436 +
  437 +
  438 +
  439 +
  440 +
  441 +
  442 +
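Taken together, the new dumper mirrors the pipeline of the existing data modules; a rough sketch of the intended workflow, using only methods defined above (the hyphen in ILSVRC-S.py means the module cannot be imported with a plain import statement, so the instantiation below is illustrative only):

    data = DataILSVRCS(base_dir='/media/chunk/Elements/D/data/ImageNet/img/ILSVRC2013_DET_val', category='Train')

    data.format()               # hash-copy raw JPEGs, write file-tag.tsv, mark ~80% of images as embedding targets
    data.embed(rate=0.1)        # F5-embed random payloads into the chosen images and merge them into the list
    data.extract_feat('ibd')    # dump intra-block-difference features under dst/<category>/Feat

    data.get_table()            # create/open the HBase table '<base-dir-name>-<category>'
    data.store_img()
    data.store_info()
    data.store_tag()
    data.store_feat('ibd')

    X, Y = data.load_data(mode='spark')   # or mode='local' / 'hbase'
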
mdata/ILSVRC.py
@@ -419,7 +419,7 @@ class DataILSVRC(DataDumperBase):
             if self.sparkcontex == None:
                 self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077')
 
-            result = self.sparkcontex.read_habase(self.table_name)  # result = {key:[feat,tag],...}
+            result = self.sparkcontex.read_hbase(self.table_name)  # result = {key:[feat,tag],...}
             for feat, tag in result:
                 X.append(feat)
                 Y.append(tag)
@@ -260,7 +260,7 @@ class DataMSR(DataDumperBase):
             if self.sparkcontex == None:
                 self.sparkcontex = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077')
 
-            result = self.sparkcontex.read_habase(self.table_name)  # result = {key:[feat,tag],...}
+            result = self.sparkcontex.read_hbase(self.table_name)  # result = {key:[feat,tag],...}
             for key, data in result.items():
                 X.append(data[0])
                 Y.append(data[1])
@@ -47,7 +47,7 @@ class Sparker(object):
 
         self.model = None
 
-    def read_habase(self, table_name, columns=None):
+    def read_hbase(self, table_name, func=None, collect=False):
         """
         ref - http://happybase.readthedocs.org/en/latest/user.html#retrieving-data
 
@@ -59,7 +59,7 @@ class Sparker(object):
         """
         hconf = {"hbase.zookeeper.quorum": self.host,
                  "hbase.mapreduce.inputtable": table_name,
-                 }
+                  }
 
         hbase_rdd = self.sc.newAPIHadoopRDD(inputFormatClass=hparams["inputFormatClass"],
                                             keyClass=hparams["readKeyClass"],
@@ -67,11 +67,16 @@ class Sparker(object):
                                             keyConverter=hparams["readKeyConverter"],
                                             valueConverter=hparams["readValueConverter"],
                                             conf=hconf)
-        hbase_rdd = hbase_rdd.map(lambda x: parse_cv(x))
-        output = hbase_rdd.collect()
-        return output
 
-    def write_habase(self, table_name, data):
+        parser = func if func != None else parse_cv
+        hbase_rdd = hbase_rdd.map(lambda x: parser(x))
+
+        if collect:
+            return hbase_rdd.collect()
+        else:
+            return hbase_rdd
+
+    def write_hbase(self, table_name, data):
         """
         Data Format:
         e.g. [["row8", "f1", "", "caocao cao"], ["row9", "f1", "c1", "asdfg hhhh"]]
@@ -82,7 +87,7 @@ class Sparker(object):
                  "mapreduce.outputformat.class": hparams["outputFormatClass"],
                  "mapreduce.job.output.key.class": hparams["writeKeyClass"],
                  "mapreduce.job.output.value.class": hparams["writeValueClass"],
-                 }
+                  }
 
         self.sc.parallelize(data).map(lambda x: (x[0], x)).saveAsNewAPIHadoopDataset(
             conf=hconf,
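The retained docstring spells out the row format the writer expects; a minimal sketch of calling the renamed method with exactly that shape (row key, column family, qualifier, value), where the table name is only a placeholder:

    data = [["row8", "f1", "", "caocao cao"],
            ["row9", "f1", "c1", "asdfg hhhh"]]
    sparker.write_hbase("test_table", data)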