Commit 8bddd8b35aabd5333192647471c7e198890081cf

Authored by Chunk
1 parent 1c2a3fa0
Exists in master and in 1 other branch: refactor

Guess what? Through all the 'debugs' and 'f**ks', we have finally finished the pyspark & hbase!

mdata/ILSVRC_S.py
@@ -361,7 +361,7 @@ class DataILSVRC_S(DataDumperBase):
         if readforward:
             self.rdd_data = self.sparkcontex.read_hbase(self.table_name, func=SC.rddparse_all_ILS, collect=False)
 
-        rdd_data_ext = self.rdd_data.map(lambda x: SC.rddembed_ILS(x))
+        rdd_data_ext = self.rdd_data.map(lambda x: SC.rddembed_ILS(x)).filter(lambda x: x != None)
         self.rdd_data = self.rdd_data.union(rdd_data_ext)
 
         if not writeback:
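The filter added here drops rows for which the embedding helper fails. A minimal, self-contained PySpark sketch of that map-then-filter pattern, using a stand-in embed function and made-up data rather than the project's SC.rddembed_ILS:

from pyspark import SparkContext

def embed(row):
    # Stand-in for SC.rddembed_ILS: return a new (key, items) pair, or None on failure.
    key, items = row
    if not items:
        return None
    return (key + '.stego', items)

if __name__ == '__main__':
    sc = SparkContext('local[2]', 'filter-none-sketch')
    rdd_data = sc.parallelize([('a.jpg', ['bytes']), ('b.jpg', [])])
    # Without the filter, the None returned for 'b.jpg' would survive into the union
    # and break any later stage that expects (key, items) pairs.
    rdd_ext = rdd_data.map(lambda x: embed(x)).filter(lambda x: x is not None)
    print(rdd_data.union(rdd_ext).collect())
    sc.stop()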
@@ -454,7 +454,10 @@ class DataILSVRC_S(DataDumperBase):
         if readforward:
             self.rdd_data = self.sparkcontex.read_hbase(self.table_name, func=SC.rddparse_all_ILS, collect=False)
 
-        self.rdd_data = self.rdd_data.map(lambda x: SC.rddfeat_ILS(x))
+        self.rdd_data = self.rdd_data.mapValues(lambda items: SC.rddfeat_ILS(items))
+
+        # print self.rdd_data.collect()[0]
+        # return
 
         if not writeback:
             return self.rdd_data
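The switch from map to mapValues matters because the HBase rows are (key, items) pairs and the reworked rddfeat_ILS now receives only the item list (see the hunk further down). A hedged sketch of the difference, with a stand-in extractor instead of SC.rddfeat_ILS:

from pyspark import SparkContext

def extract_feat(items):
    # Stand-in for SC.rddfeat_ILS: append a fake descriptor to the item list.
    return items + ['desc']

if __name__ == '__main__':
    sc = SparkContext('local[2]', 'mapvalues-sketch')
    rdd = sc.parallelize([('a.jpg', ['img_bytes', 1]), ('b.jpg', ['img_bytes', 2])])
    # map(f) would hand f the whole (key, items) tuple; mapValues(f) passes only the
    # value list and keeps the key attached to the result automatically.
    print(rdd.mapValues(extract_feat).collect())
    sc.stop()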
@@ -56,9 +56,16 @@ def rddparse_data_ILS(raw_row):
 
 
 def rddparse_all_ILS(raw_row):
+    """
+    Deprecated
+    """
     key = raw_row[0]
     items = raw_row[1].decode('unicode-escape').encode('latin-1').split('--%--')
+
+    # @TODO
+    # N.B "ValueError: No JSON object could be decoded" Because the spark-hbase IO is based on strings.
     data = [items[0].split('cf_pic:data:')[-1]] + [json.loads(item.split(':')[-1]) for item in items[1:]]
+
     return (key, data)
 
 
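The @TODO refers to the fact that a string-based spark-hbase connector hands every cell back as plain text, so json.loads only succeeds on cells that were written as valid JSON in the first place. A small illustration of the failure mode (plain Python, no Spark or HBase involved):

import json

cells = ['[1, 2, 3]',           # written via json.dumps, decodes fine
         '\xff\xd8 raw bytes']  # e.g. image data, not JSON
for cell in cells:
    try:
        print(json.loads(cell))
    except ValueError as e:
        print('ValueError: %s' % e)  # "No JSON object could be decoded" on Python 2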
@@ -145,17 +152,15 @@ def _get_feat(image, feattype='ibd', **kwargs):
     return desc
 
 
-def rddfeat_ILS(row, feattype='ibd', **kwargs):
-    items = row[1]
-    capacity, rate, chosen = int(items[4]), float(items[6]), int(items[7])
+def rddfeat_ILS(items, feattype='ibd', **kwargs):
     try:
         tmpf_src = tempfile.NamedTemporaryFile(suffix='.jpg', mode='w+b')
         tmpf_src.write(items[0])
         tmpf_src.seek(0)
 
         desc = json.dumps(_get_feat(tmpf_src.name, feattype=feattype).tolist())
-
-        return (row[0], row[1].append(desc))
+        # print 'desccccccccccccccccccc',desc
+        return items + [desc]
 
     except Exception as e:
         print e
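The old return statement, return (row[0], row[1].append(desc)), silently produced (key, None), because list.append mutates the list in place and returns None; the fixed version concatenates instead, which returns a new list. A two-line reminder of the difference:

items, desc = ['img_bytes', 'label'], '[0.1, 0.2]'
print(items.append(desc))   # None: append mutates items and returns nothing
items = ['img_bytes', 'label']
print(items + [desc])       # ['img_bytes', 'label', '[0.1, 0.2]'], as the fixed code returns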
@@ -174,6 +179,8 @@ def format_out(row, cols):
     """
     puts = []
     key = row[0]
+    if key == '04650c488a2b163ca8a1f52da6022f03.jpg':
+        print row
     for data, col in zip(row[1], cols):
         puts.append((key, [key] + col + [str(data)]))
     return puts
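For context on what the debug print above will show: format_out expands one (key, data-list) row into one put-style tuple per column. A hedged, runnable sketch with invented column names (only cf_pic:data appears elsewhere in this diff; cf_info:rate and cf_feat:ibd are placeholders):

def format_out(row, cols):
    puts = []
    key = row[0]
    for data, col in zip(row[1], cols):
        puts.append((key, [key] + col + [str(data)]))
    return puts

row = ('04650c488a2b163ca8a1f52da6022f03.jpg', ['<jpeg bytes>', 0.1, '[0.3, 0.7]'])
cols = [['cf_pic', 'data'], ['cf_info', 'rate'], ['cf_feat', 'ibd']]
print(format_out(row, cols))
# [('04650c....jpg', ['04650c....jpg', 'cf_pic', 'data', '<jpeg bytes>']), ...]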
res/toembed
@@ -1,2 +0,0 @@
-3. A segment descriptor is a data-structure entry in the GDT and LDT tables; it supplies the processor with a segment's location and size as well as access-control and status information. Each segment descriptor is 8 bytes long and holds three main fields: the segment base address, the segment limit, and the segment attributes. Depending on the expansion-direction flag E in the segment type, the processor interprets the segment limit (Limit) in two ways: for an expand-up segment, the offset in a logical address may range from 0 up to Limit, and offsets greater than Limit raise a general-protection exception; for an expand-down segment, the meaning of Limit is reversed.
-(Why is the segment length 20 bits? 4GB)
res/toembed.bak 0 → 100644
@@ -0,0 +1,2 @@
+3. A segment descriptor is a data-structure entry in the GDT and LDT tables; it supplies the processor with a segment's location and size as well as access-control and status information. Each segment descriptor is 8 bytes long and holds three main fields: the segment base address, the segment limit, and the segment attributes. Depending on the expansion-direction flag E in the segment type, the processor interprets the segment limit (Limit) in two ways: for an expand-up segment, the offset in a logical address may range from 0 up to Limit, and offsets greater than Limit raise a general-protection exception; for an expand-down segment, the meaning of Limit is reversed.
+(Why is the segment length 20 bits? 4GB)
test/test_data.py
@@ -72,9 +72,9 @@ def test_ILSVRC_S():
 
     # dils.format()
 
-    # dils._extract_data(mode='spark', writeback=True)
-    dils._embed_data(mode='spark', rate=0.1, readforward=True, writeback=True)
-    # dils._extract_feat( mode='spark', feattype='ibd', readforward=False, writeback=False)
+    dils._extract_data(mode='spark', writeback=False)
+    dils._embed_data(mode='spark', rate=0.1, readforward=False, writeback=False)
+    dils._extract_feat( mode='spark', feattype='ibd', readforward=False, writeback=True)
 
 
 if __name__ == '__main__':