Commit 26616791c5ea340b07ab5fbeaf3d2922812d7d9f
1 parent
13a594f1
Exists in
master
and in
1 other branch
RDD-hbase bug fixed.(with 'repartition()')
Showing
2 changed files
with
9 additions
and
5 deletions
Show diff stats
mspark/SC.py
| @@ -247,9 +247,10 @@ def format_out(row, cols, withdata=False): | @@ -247,9 +247,10 @@ def format_out(row, cols, withdata=False): | ||
| 247 | puts.append((key, [key] + col + [str(data)])) | 247 | puts.append((key, [key] + col + [str(data)])) |
| 248 | return puts | 248 | return puts |
| 249 | 249 | ||
| 250 | + | ||
| 250 | # scconf = SparkConf() | 251 | # scconf = SparkConf() |
| 251 | # scconf.setSparkHome("HPC-server") \ | 252 | # scconf.setSparkHome("HPC-server") \ |
| 252 | -# .setMaster("spark://HPC-server:7077") \ | 253 | +# .setMaster("spark://HPC-server:7077") \ |
| 253 | # .setAppName("example") | 254 | # .setAppName("example") |
| 254 | # sc = SparkContext(conf=scconf) | 255 | # sc = SparkContext(conf=scconf) |
| 255 | # | 256 | # |
| @@ -342,7 +343,7 @@ class Sparker(object): | @@ -342,7 +343,7 @@ class Sparker(object): | ||
| 342 | self.model = None | 343 | self.model = None |
| 343 | 344 | ||
| 344 | 345 | ||
| 345 | - def read_hbase(self, table_name, func=None, collect=False): | 346 | + def read_hbase(self, table_name, func=None, collect=False, parallelism=40): |
| 346 | """ | 347 | """ |
| 347 | ref - http://happybase.readthedocs.org/en/latest/user.html#retrieving-data | 348 | ref - http://happybase.readthedocs.org/en/latest/user.html#retrieving-data |
| 348 | 349 | ||
| @@ -372,7 +373,7 @@ class Sparker(object): | @@ -372,7 +373,7 @@ class Sparker(object): | ||
| 372 | if collect: | 373 | if collect: |
| 373 | return hbase_rdd.collect() | 374 | return hbase_rdd.collect() |
| 374 | else: | 375 | else: |
| 375 | - return hbase_rdd | 376 | + return hbase_rdd.repartition(parallelism) |
| 376 | 377 | ||
| 377 | def write_hbase(self, table_name, data, fromrdd=False, columns=None, withdata=False): | 378 | def write_hbase(self, table_name, data, fromrdd=False, columns=None, withdata=False): |
| 378 | """ | 379 | """ |
test/test_data.py
| @@ -93,7 +93,7 @@ def test_ILSVRC_S_LOCAL(): | @@ -93,7 +93,7 @@ def test_ILSVRC_S_LOCAL(): | ||
| 93 | timer.report() | 93 | timer.report() |
| 94 | 94 | ||
| 95 | 95 | ||
| 96 | -def test_ILSVRC_S_SPARK(category='Train_200'): | 96 | +def test_ILSVRC_S_SPARK(category='Train_1000'): |
| 97 | timer = Timer() | 97 | timer = Timer() |
| 98 | 98 | ||
| 99 | timer.mark() | 99 | timer.mark() |
| @@ -102,7 +102,7 @@ def test_ILSVRC_S_SPARK(category='Train_200'): | @@ -102,7 +102,7 @@ def test_ILSVRC_S_SPARK(category='Train_200'): | ||
| 102 | dil.format() | 102 | dil.format() |
| 103 | dil.store_img() | 103 | dil.store_img() |
| 104 | timer.report() | 104 | timer.report() |
| 105 | - return | 105 | + # return |
| 106 | 106 | ||
| 107 | dils = ILSVRC_S.DataILSVRC_S(base='ILSVRC2013_DET_val', category=category) | 107 | dils = ILSVRC_S.DataILSVRC_S(base='ILSVRC2013_DET_val', category=category) |
| 108 | 108 | ||
| @@ -110,6 +110,9 @@ def test_ILSVRC_S_SPARK(category='Train_200'): | @@ -110,6 +110,9 @@ def test_ILSVRC_S_SPARK(category='Train_200'): | ||
| 110 | dils._extract_data(mode='spark', writeback=False) | 110 | dils._extract_data(mode='spark', writeback=False) |
| 111 | timer.report() | 111 | timer.report() |
| 112 | 112 | ||
| 113 | + # print dils.rdd_data.count() # pass | ||
| 114 | + # return | ||
| 115 | + | ||
| 113 | timer.mark() | 116 | timer.mark() |
| 114 | dils._embed_data(mode='spark', rate=0.2, readforward=False, writeback=False) | 117 | dils._embed_data(mode='spark', rate=0.2, readforward=False, writeback=False) |
| 115 | timer.report() | 118 | timer.report() |