Commit 26616791c5ea340b07ab5fbeaf3d2922812d7d9f

Authored by Chunk
1 parent 13a594f1
Exists in master and in 1 other branch: refactor

RDD-hbase bug fixed (with 'repartition()').

Showing 2 changed files with 9 additions and 5 deletions
@@ -247,9 +247,10 @@ def format_out(row, cols, withdata=False):
         puts.append((key, [key] + col + [str(data)]))
     return puts
 
+
 # scconf = SparkConf()
 # scconf.setSparkHome("HPC-server") \
-#        .setMaster("spark://HPC-server:7077") \
+#        .setMaster("spark://HPC-server:7077") \
 #        .setAppName("example")
 # sc = SparkContext(conf=scconf)
 #
@@ -342,7 +343,7 @@ class Sparker(object):
         self.model = None
 
 
-    def read_hbase(self, table_name, func=None, collect=False):
+    def read_hbase(self, table_name, func=None, collect=False, parallelism=40):
         """
         ref - http://happybase.readthedocs.org/en/latest/user.html#retrieving-data
 
@@ -372,7 +373,7 @@ class Sparker(object):
         if collect:
             return hbase_rdd.collect()
         else:
-            return hbase_rdd
+            return hbase_rdd.repartition(parallelism)
 
     def write_hbase(self, table_name, data, fromrdd=False, columns=None, withdata=False):
         """
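The substance of the fix: newAPIHadoopRDD() creates one partition per HBase region, so the RDD returned by read_hbase() can end up with only a partition or two and starve downstream stages; the method now shuffles it out to a configurable parallelism before handing it back. Below is a minimal sketch of how the resulting method might look, assuming self.sc holds the SparkContext and the stock Spark-examples HBase converters are used; the conf keys, converter classes, and func handling are assumptions, not the module's actual code.

    def read_hbase(self, table_name, func=None, collect=False, parallelism=40):
        # Assumed HBase scan configuration; the real module may set more keys.
        hbase_conf = {"hbase.zookeeper.quorum": "HPC-server",
                      "hbase.mapreduce.inputtable": table_name}
        hbase_rdd = self.sc.newAPIHadoopRDD(
            "org.apache.hadoop.hbase.mapreduce.TableInputFormat",
            "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
            "org.apache.hadoop.hbase.client.Result",
            keyConverter="org.apache.spark.examples.pythonconverters."
                         "ImmutableBytesWritableToStringConverter",
            valueConverter="org.apache.spark.examples.pythonconverters."
                           "HBaseResultToStringConverter",
            conf=hbase_conf)
        if func is not None:
            hbase_rdd = hbase_rdd.map(func)  # optional per-row parsing hook
        if collect:
            return hbase_rdd.collect()
        # repartition() forces a full shuffle so later stages run with
        # `parallelism` tasks instead of one task per HBase region.
        return hbase_rdd.repartition(parallelism)

The shuffle has a cost, but it is paid once at read time in exchange for even task distribution in the Spark stages that follow.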
test/test_data.py
@@ -93,7 +93,7 @@ def test_ILSVRC_S_LOCAL():
     timer.report()
 
 
-def test_ILSVRC_S_SPARK(category='Train_200'):
+def test_ILSVRC_S_SPARK(category='Train_1000'):
     timer = Timer()
 
     timer.mark()
@@ -102,7 +102,7 @@ def test_ILSVRC_S_SPARK(category='Train_200'):
     dil.format()
     dil.store_img()
     timer.report()
-    return
+    # return
 
     dils = ILSVRC_S.DataILSVRC_S(base='ILSVRC2013_DET_val', category=category)
 
@@ -110,6 +110,9 @@ def test_ILSVRC_S_SPARK(category='Train_200'):
     dils._extract_data(mode='spark', writeback=False)
     timer.report()
 
+    # print dils.rdd_data.count()  # pass
+    # return
+
     timer.mark()
    dils._embed_data(mode='spark', rate=0.2, readforward=False, writeback=False)
     timer.report()
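For callers, the new knob is visible at the call site; a hypothetical usage, where the Sparker construction and table name are placeholders and only the read_hbase signature comes from the diff above:

    sparker = Sparker()  # hypothetical default construction
    rdd = sparker.read_hbase('ILSVRC2013_DET_val', parallelism=80)
    print(rdd.getNumPartitions())  # expect 80 partitions after the shuffle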