Commit 54e2adda230421077d556c3d775d9f399b82652e
1 parent d642d837
Exists in master and in 1 other branch: staged.
Showing 3 changed files with 179 additions and 58 deletions.
mspark/SC.py
@@ -247,6 +247,77 @@ def format_out(row, cols, withdata=False):
         puts.append((key, [key] + col + [str(data)]))
     return puts
 
+# scconf = SparkConf()
+# scconf.setSparkHome("HPC-server") \
+#     .setMaster("spark://HPC-server:7077") \
+#     .setAppName("example")
+# sc = SparkContext(conf=scconf)
+#
+#
+# def read_hbase(table_name, func=None, collect=False):
+#     """
+#     ref - http://happybase.readthedocs.org/en/latest/user.html#retrieving-data
+#
+#     Filter format:
+#         columns=['cf1:col1', 'cf1:col2']
+#         or
+#         columns=['cf1']
+#
+#     """
+#
+#     hconf = {
+#         "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2",
+#         # "hbase.zookeeper.quorum": self.host,
+#         "hbase.mapreduce.inputtable": table_name,
+#     }
+#
+#     hbase_rdd = sc.newAPIHadoopRDD(inputFormatClass=hparams["inputFormatClass"],
+#                                    keyClass=hparams["readKeyClass"],
+#                                    valueClass=hparams["readValueClass"],
+#                                    keyConverter=hparams["readKeyConverter"],
+#                                    valueConverter=hparams["readValueConverter"],
+#                                    conf=hconf)
+#
+#     parser = func if func != None else rddparse_data_CV
+#     hbase_rdd = hbase_rdd.map(lambda x: parser(x))
+#
+#     if collect:
+#         return hbase_rdd.collect()
+#     else:
+#         return hbase_rdd
+#
+#
+# def write_hbase(table_name, data, fromrdd=False, columns=None, withdata=False):
+#     """
+#     Data Format: (Deprecated)
+#     e.g. [["row8", "f1", "", "caocao cao"], ["row9", "f1", "c1", "asdfg hhhh"]]
+#
+#     Data(from dictionary):
+#     e.g. data = {'row1': [1, 3400, 'hello'], 'row2': [34, 5000, 'here in mine']},
+#          cols = ['cf_info:id', 'cf_info:size', 'cf_tag:desc']
+#     Data(from Rdd):
+#     e.g. data = [('row1', [1, 3400, 'hello']), ('row2', [34, 5000, 'here in mine'])],
+#          cols = ['cf_info:id', 'cf_info:size', 'cf_tag:desc']
+#     """
+#     hconf = {
+#         "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2",  # "hbase.zookeeper.quorum": self.host,
+#         "hbase.mapreduce.inputtable": table_name,
+#         "hbase.mapred.outputtable": table_name,
+#         "mapreduce.outputformat.class": hparams["outputFormatClass"],
+#         "mapreduce.job.output.key.class": hparams["writeKeyClass"],
+#         "mapreduce.job.output.value.class": hparams["writeValueClass"],
+#     }
+#     cols = [col.split(':') for col in columns]
+#     if not fromrdd:
+#         rdd_data = sc.parallelize(data)
+#     else:
+#         rdd_data = data
+#
+#     rdd_data.flatMap(lambda x: format_out(x, cols, withdata=withdata)).saveAsNewAPIHadoopDataset(
+#         conf=hconf,
+#         keyConverter=hparams["writeKeyConverter"],
+#         valueConverter=hparams["writeValueConverter"])
+
 
 class Sparker(object):
     def __init__(self, host='HPC-server', appname='NewPySparkApp', **kwargs):
@@ -283,9 +354,9 @@ class Sparker(object):
         """
 
         hconf = {
-            "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2",
-            # "hbase.zookeeper.quorum": self.host,
-            "hbase.mapreduce.inputtable": table_name,
+            "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2",
+            # "hbase.zookeeper.quorum": self.host,
+            "hbase.mapreduce.inputtable": table_name,
         }
 
         hbase_rdd = self.sc.newAPIHadoopRDD(inputFormatClass=hparams["inputFormatClass"],
@@ -316,12 +387,12 @@ class Sparker(object):
             cols = ['cf_info:id', 'cf_info:size', 'cf_tag:desc']
         """
         hconf = {
-            "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2",  # "hbase.zookeeper.quorum": self.host,
-            "hbase.mapreduce.inputtable": table_name,
-            "hbase.mapred.outputtable": table_name,
-            "mapreduce.outputformat.class": hparams["outputFormatClass"],
-            "mapreduce.job.output.key.class": hparams["writeKeyClass"],
-            "mapreduce.job.output.value.class": hparams["writeValueClass"],
+            "hbase.zookeeper.quorum": "HPC-server, HPC, HPC2",  # "hbase.zookeeper.quorum": self.host,
+            "hbase.mapreduce.inputtable": table_name,
+            "hbase.mapred.outputtable": table_name,
+            "mapreduce.outputformat.class": hparams["outputFormatClass"],
+            "mapreduce.job.output.key.class": hparams["writeKeyClass"],
+            "mapreduce.job.output.value.class": hparams["writeValueClass"],
         }
         cols = [col.split(':') for col in columns]
         if not fromrdd:
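
For orientation, a rough usage sketch of the Sparker read/write pair whose hunks appear above. This is a hypothetical example, not code from the commit: the table name is borrowed from the tests below, `ExampleApp` is made up, and the call shapes follow only the method signatures and docstrings visible in this diff (with `hparams` and `rddparse_data_CV` assumed to be defined elsewhere in mspark/SC.py):

    from mspark import SC

    # Hypothetical driver; appname is illustrative only.
    sparker = SC.Sparker(host='HPC-server', appname='ExampleApp')

    # read_hbase returns a parsed RDD, or a list when collect=True.
    rows = sparker.read_hbase('ILSVRC2013_DET_val-Test_1', collect=True)

    # write_hbase, dictionary form, per the docstring in the diff:
    data = {'row1': [1, 3400, 'hello'], 'row2': [34, 5000, 'here in mine']}
    cols = ['cf_info:id', 'cf_info:size', 'cf_tag:desc']
    sparker.write_hbase('ILSVRC2013_DET_val-Test_1', data, fromrdd=False, columns=cols)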
test/test_data.py
@@ -93,13 +93,13 @@ def test_ILSVRC_S_LOCAL():
 def test_ILSVRC_S_SPARK():
     timer = Timer()
 
-    # timer.mark()
-    # dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_1')
-    # dil.delete_table()
-    # dil.format()
-    # dil.store_img()
-    # timer.report()
-    # return
+    timer.mark()
+    dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_1')
+    dil.delete_table()
+    dil.format()
+    dil.store_img()
+    timer.report()
+    return
 
     dils = ILSVRC_S.DataILSVRC_S(base='ILSVRC2013_DET_val', category='Test_1')
 
test/test_whole.py
@@ -2,48 +2,98 @@ __author__ = 'chunk'
 
 from ..mspark import SC
 from pyspark.mllib.regression import LabeledPoint
-
-
-cols0 = [
-    'cf_pic:data',
-    'cf_info:width',
-    'cf_info:height',
-    'cf_info:size',
-    'cf_info:capacity',
-    'cf_info:quality',
-    'cf_info:rate',
-    'cf_tag:chosen',
-    'cf_tag:class'
-]
-cols1 = [
-    'cf_pic:data',
-    'cf_info:width',
-    'cf_info:height',
-    'cf_info:size',
-    'cf_info:capacity',
-    'cf_info:quality',
-    'cf_info:rate',
-    'cf_tag:chosen',
-    'cf_tag:class',
-    'cf_feat:bid',
-]
-
-sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', master='spark://HPC-server:7077')
-
-rdd_data = sparker.read_hbase("ILSVRC2013_DET_val-Test_1", func=SC.rddparse_data_ILS, collect=False) \
-    .mapValues(lambda data: [data] + SC.rddinfo_ILS(data)) \
-    .flatMap(lambda x: SC.rddembed_ILS_EXT(x, rate=0.2)) \
-    .mapValues(lambda items: SC.rddfeat_ILS(items))
-
-sparker.write_hbase("ILSVRC2013_DET_val-Test_1", rdd_data, fromrdd=True, columns=cols1,
-                    withdata=True)
-
-
-
-
-
-
-
+import happybase
+
+def test_whole():
+    cols0 = [
+        'cf_pic:data',
+        'cf_info:width',
+        'cf_info:height',
+        'cf_info:size',
+        'cf_info:capacity',
+        'cf_info:quality',
+        'cf_info:rate',
+        'cf_tag:chosen',
+        'cf_tag:class'
+    ]
+    cols1 = [
+        'cf_pic:data',
+        'cf_info:width',
+        'cf_info:height',
+        'cf_info:size',
+        'cf_info:capacity',
+        'cf_info:quality',
+        'cf_info:rate',
+        'cf_tag:chosen',
+        'cf_tag:class',
+        'cf_feat:bid',
+    ]
+
+    sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', master='spark://HPC-server:7077')
+
+    # rdd_data = sparker.read_hbase("ILSVRC2013_DET_val-Test_1", func=SC.rddparse_data_ILS, collect=False) \
+    #     .mapValues(lambda data: [data] + SC.rddinfo_ILS(data)) \
+    #     .flatMap(lambda x: SC.rddembed_ILS_EXT(x, rate=0.2)) \
+    #     .mapValues(lambda items: SC.rddfeat_ILS(items))
+
+    rdd_data = sparker.read_hbase("ILSVRC2013_DET_val-Test_1", func=SC.rddparse_data_ILS, collect=False).mapValues(
+        lambda data: [data] + SC.rddinfo_ILS(data))
+    rdd_data_ext = rdd_data.map(lambda x: SC.rddembed_ILS(x, rate=0.2)).filter(lambda x: x != None)
+
+    rdd_data = rdd_data.union(rdd_data_ext).mapValues(lambda items: SC.rddfeat_ILS(items))
+
+    print len(rdd_data.collect())
+
+    # sparker.write_hbase("ILSVRC2013_DET_val-Test_1", rdd_data, fromrdd=True, columns=cols1,
+    #                     withdata=True)
+
+
+def test_whole_ext():
+    table_name = "ILSVRC2013_DET_val-Test_1"
+    connection = happybase.Connection('HPC-server')
+    tables = connection.tables()
+    if table_name not in tables:
+        families = {'cf_pic': dict(),
+                    'cf_info': dict(max_versions=10),
+                    'cf_tag': dict(),
+                    'cf_feat': dict(),
+                    }
+        connection.create_table(name=table_name, families=families)
+    table = connection.table(name=table_name)
+
+    cols = ['cf_pic:data']
+    list_data = []
+    for key, data in table.scan(columns=cols):
+        data = data['cf_pic:data']
+        list_data.append((key, data))
+
+    sparker = SC.Sparker(host='HPC-server', appname='ImageILSVRC-S', master='spark://HPC-server:7077')
+    rdd_data = sparker.sc.parallelize(list_data, 20) \
+        .mapValues(lambda data: [data] + SC.rddinfo_ILS(data)) \
+        .flatMap(lambda x: SC.rddembed_ILS_EXT(x, rate=0.2)) \
+        .mapValues(lambda items: SC.rddfeat_ILS(items))
+
+    rrr = rdd_data.collect()
+    print "-----------------", len(rrr), "===================="
+    print "+++++++++++++++++", rrr[0], "**********************"
+    # try:
+    #     with table.batch(batch_size=5000) as b:
+    #         for imgname, imginfo in rdd_data.collect().items():
+    #             b.put(imgname,
+    #                   {
+    #                       'cf_pic:data': imginfo[0],
+    #                       'cf_info:width': str(imginfo[1]),
+    #                       'cf_info:height': str(imginfo[2]),
+    #                       'cf_info:size': str(imginfo[3]),
+    #                       'cf_info:capacity': str(imginfo[4]),
+    #                       'cf_info:quality': str(imginfo[5]),
+    #                       'cf_info:rate': str(imginfo[6]),
+    #                       'cf_tag:chosen': str(imginfo[7]),
+    #                       'cf_tag:class': str(imginfo[8]),
+    #                       'cf_feat:' + feattype: imginfo[9],
+    #                   })
+    # except ValueError:
+    #     raise
 
 
 
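The happybase calls in test_whole_ext (Connection, table, scan, batch, put) are all part of happybase's documented API. As a minimal self-contained sketch of that read/write pattern, with the host and table name taken from the test above and the row key and value invented for illustration:

    import happybase

    # Connects to the HBase Thrift gateway (default port 9090).
    connection = happybase.Connection('HPC-server')
    table = connection.table('ILSVRC2013_DET_val-Test_1')

    # scan() yields (row_key, {column: value}) pairs; restrict to the image column.
    for key, row in table.scan(columns=['cf_pic:data']):
        raw = row['cf_pic:data']

    # batch() buffers puts and flushes every batch_size mutations.
    with table.batch(batch_size=5000) as b:
        b.put('row_example', {'cf_info:width': '640'})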