Commit ec755e37654e61074fe94b3fb10932bb14c0d856

Authored by Chunk
1 parent 2fe06b7f
Exists in master and in 1 other branch: refactor

cropping.

Showing 2 changed files with 74 additions and 14 deletions
mdata/ILSVRC.py
@@ -22,6 +22,7 @@ import numpy as np
 from numpy.random import randn
 import pandas as pd
 from scipy import stats
+import random
 
 from subprocess import Popen, PIPE, STDOUT
 
@@ -268,6 +269,39 @@ class DataILSVRC(DataDumperBase):
     def embed(self, rate=None):
         self._embed_inner(rate)
 
+
+    def crop(self, size=(300, 300)):
+        for path, subdirs, files in os.walk(self.data_dir):
+            for name in files:
+                image = os.path.join(path, name)
+                print image
+
+                W, H = size
+                try:
+                    im = Image.open(image)
+                    w, h = im.size
+                    if w < W or h < H:
+                        continue
+                    left, upper = random.randint(0, w - W), random.randint(0, h - H)
+                    im = im.crop((left, upper, left + W, upper + H))
+                    im.save(os.path.join(self.data_dir + '_crop_pil', name))
+                except Exception as e:
+                    print '[EXCPT]', e
+                    pass
+
+                # try:
+                #     img = cv2.imread(image, cv2.CV_LOAD_IMAGE_UNCHANGED)
+                #     h, w = img.shape[:2]
+                #     if w < 300 or h < 300:
+                #         continue
+                #     left, upper = random.randint(0, w - 300), random.randint(0, h - 300)
+                #     img_crop = img[upper:upper + 300, left:left + 300]
+                #     cv2.imwrite(os.path.join(base_dir, category + '_crop_cv', name), img_crop)
+                # except Exception as e:
+                #     print '[EXCPT]', e
+                #     pass
+
+
     def get_table(self):
         if self.table != None:
             return self.table
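
Note: the new crop() walks self.data_dir, skips any image smaller than the requested size, takes one random 300x300 crop with PIL, and saves it under a sibling '<data_dir>_crop_pil' directory; a cv2 variant of the same logic is left commented out. Below is a minimal standalone sketch of the same idea (Python 3 for illustration, not the committed Python 2 code; src_dir and dst_dir are hypothetical stand-ins for self.data_dir and its '_crop_pil' sibling):

    # Illustrative sketch only; not the committed method.
    import os
    import random
    from PIL import Image

    def crop_images(src_dir, dst_dir, size=(300, 300)):
        W, H = size
        os.makedirs(dst_dir, exist_ok=True)          # the committed code assumes the output dir exists
        for path, _subdirs, files in os.walk(src_dir):
            for name in files:
                image_path = os.path.join(path, name)
                try:
                    im = Image.open(image_path)
                    w, h = im.size
                    if w < W or h < H:               # skip images smaller than the crop window
                        continue
                    left = random.randint(0, w - W)
                    upper = random.randint(0, h - H)
                    im.crop((left, upper, left + W, upper + H)).save(os.path.join(dst_dir, name))
                except Exception as e:               # unreadable files are reported and skipped, as in the commit
                    print('[EXCPT]', e)

In the committed version a missing output directory would only surface as a caught save error in the except block; the sketch creates it up front instead.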
@@ -410,30 +444,44 @@ class DataILSVRC(DataDumperBase):
 
             dict_dataset = {}
 
-            with open(self.list_file, 'rb') as tsvfile:
-                tsvfile = csv.reader(tsvfile, delimiter='\t')
-                for line in tsvfile:
-                    hash = line[0]
-                    tag = line[-1]
-                    path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype)
-                    if path_feat:
-                        with open(path_feat, 'rb') as featfile:
-                            dict_dataset[hash] = (tag, json.loads(featfile.read()))
+            if feattype == 'coef':  # raw
+                with open(self.list_file, 'rb') as tsvfile:
+                    tsvfile = csv.reader(tsvfile, delimiter='\t')
+                    for line in tsvfile:
+                        hash = line[0]
+                        tag = line[-1]
+                        image = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.jpg')
+                        if image:
+                            im = Jpeg(image, key=sample_key)
+                            dict_dataset[hash] = (tag, im.getCoefBlocks('Y'))
+
+            else:
+                with open(self.list_file, 'rb') as tsvfile:
+                    tsvfile = csv.reader(tsvfile, delimiter='\t')
+                    for line in tsvfile:
+                        hash = line[0]
+                        tag = line[-1]
+                        path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype)
+                        if path_feat:
+                            with open(path_feat, 'rb') as featfile:
+                                dict_dataset[hash] = (tag, json.loads(featfile.read()))
 
             for tag, feat in dict_dataset.values():
-                X.append([item for sublist in feat for subsublist in sublist for item in subsublist])
+                # X.append([item for sublist in feat for subsublist in sublist for item in subsublist])
+                X.append(np.array(feat).ravel().tolist())
                 Y.append(int(tag))
 
-        elif mode == "remote" or mode == "hbase":
+        elif mode == "hbase":  # remote
             if self.table == None:
                 self.table = self.get_table()
 
             col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype
             for key, data in self.table.scan(columns=[col_feat, col_tag]):
-                X.append([item for sublist in json.loads(data[col_feat]) for subsublist in sublist for item in subsublist])
+                X.append(
+                    [item for sublist in json.loads(data[col_feat]) for subsublist in sublist for item in subsublist])
                 Y.append(int(data[col_tag]))
 
-        elif mode == "spark" or mode == "cluster":
+        elif mode == "spark":  # cluster
             if self.sparker == None:
                 self.sparker = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077')
 
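
Note: load_data(mode='local') now has a dedicated branch for feattype == 'coef' that reads the JPEG's luminance DCT coefficient blocks directly (Jpeg(...).getCoefBlocks('Y'), where Jpeg and sample_key are assumed to come from elsewhere in the module) instead of a precomputed feature file. The flattening of each feature into a row of X also switches from a hand-written three-level list comprehension to np.array(feat).ravel().tolist(), which works for any regular block layout. A small sketch of that flattening step (the block shape below is only an example, not the repository's actual coefficient layout):

    import numpy as np

    # Example: four 8x8 coefficient blocks; the real shape depends on the image.
    feat = np.arange(4 * 8 * 8).reshape(4, 8, 8)

    # Old: [item for sublist in feat for subsublist in sublist for item in subsublist]
    # New: ravel() flattens the same structure (and any deeper regular nesting) in C order.
    x = np.array(feat).ravel().tolist()
    assert len(x) == feat.size  # 256 values here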
test/test_data.py
@@ -135,7 +135,19 @@ def test_pipeline():
 
 
 def test_crop():
-    crop.crop_Test()
+    # crop.crop_Test()
+    dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_1')
+    dil.crop()
+
+    dil2 = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_1_crop_pil')
+
+    dil.format()
+    dil.embed(rate=0.2)
+
+    X,Y = dil2.load_data(mode='local',feattype='coef')
+    print X[0]
+    print Y
+
 
 if __name__ == '__main__':
     # test_MSR()
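
Note: the updated test_crop() exercises the whole flow against a local ILSVRC directory: crop the Test_1 images, run format() and embed(rate=0.2), then load raw 'coef' features from the Test_1_crop_pil copy. A quick standalone sanity check of just the crop output could look like this (Python 3 sketch; the path is an assumption based on the test's category names, i.e. that data_dir is base_dir/category):

    import os
    from PIL import Image

    # Hypothetical path, assuming data_dir == os.path.join(base_dir, category).
    crop_dir = '/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val/Test_1_crop_pil'

    for name in os.listdir(crop_dir):
        with Image.open(os.path.join(crop_dir, name)) as im:
            # crop() defaults to size=(300, 300), so every saved image should match.
            assert im.size == (300, 300), (name, im.size)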