Commit ec755e37654e61074fe94b3fb10932bb14c0d856

Authored by Chunk
1 parent 2fe06b7f
Exists in master and in 1 other branch refactor

cropping.

Showing 2 changed files with 74 additions and 14 deletions   Show diff stats
mdata/ILSVRC.py
... ... @@ -22,6 +22,7 @@ import numpy as np
22 22 from numpy.random import randn
23 23 import pandas as pd
24 24 from scipy import stats
  25 +import random
25 26  
26 27 from subprocess import Popen, PIPE, STDOUT
27 28  
... ... @@ -268,6 +269,39 @@ class DataILSVRC(DataDumperBase):
def embed(self, rate=None):
    """Run the embedding step by delegating to ``self._embed_inner``.

    :param rate: forwarded unchanged to ``_embed_inner``; defaults to ``None``.
    :return: ``None`` -- the result of ``_embed_inner`` is not propagated.
    """
    self._embed_inner(rate)
270 271  
  272 +
  273 + def crop(self, size=(300, 300)):
  274 + for path, subdirs, files in os.walk(self.data_dir):
  275 + for name in files:
  276 + image = os.path.join(path, name)
  277 + print image
  278 +
  279 + W, H = size
  280 + try:
  281 + im = Image.open(image)
  282 + w, h = im.size
  283 + if w < W or h < H:
  284 + continue
  285 + left, upper = random.randint(0, w - W), random.randint(0, h - H)
  286 + im = im.crop((left, upper, left + W, upper + H))
  287 + im.save(os.path.join(self.data_dir + '_crop_pil', name))
  288 + except Exception as e:
  289 + print '[EXCPT]', e
  290 + pass
  291 +
  292 + # try:
  293 + # img = cv2.imread(image, cv2.CV_LOAD_IMAGE_UNCHANGED)
  294 + # h, w = img.shape[:2]
  295 + # if w < 300 or h < 300:
  296 + # continue
  297 + # left, upper = random.randint(0, w - 300), random.randint(0, h - 300)
  298 + # img_crop = img[upper:upper + 300, left:left + 300]
  299 + # cv2.imwrite(os.path.join(base_dir, category + '_crop_cv', name), img_crop)
  300 + # except Exception as e:
  301 + # print '[EXCPT]', e
  302 + # pass
  303 +
  304 +
271 305 def get_table(self):
272 306 if self.table != None:
273 307 return self.table
... ... @@ -410,30 +444,44 @@ class DataILSVRC(DataDumperBase):
410 444  
411 445 dict_dataset = {}
412 446  
413   - with open(self.list_file, 'rb') as tsvfile:
414   - tsvfile = csv.reader(tsvfile, delimiter='\t')
415   - for line in tsvfile:
416   - hash = line[0]
417   - tag = line[-1]
418   - path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype)
419   - if path_feat:
420   - with open(path_feat, 'rb') as featfile:
421   - dict_dataset[hash] = (tag, json.loads(featfile.read()))
  447 + if feattype == 'coef': # raw
  448 + with open(self.list_file, 'rb') as tsvfile:
  449 + tsvfile = csv.reader(tsvfile, delimiter='\t')
  450 + for line in tsvfile:
  451 + hash = line[0]
  452 + tag = line[-1]
  453 + image = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.jpg')
  454 + if image:
  455 + im = Jpeg(image, key=sample_key)
  456 + dict_dataset[hash] = (tag, im.getCoefBlocks('Y'))
  457 +
  458 + else:
  459 + with open(self.list_file, 'rb') as tsvfile:
  460 + tsvfile = csv.reader(tsvfile, delimiter='\t')
  461 + for line in tsvfile:
  462 + hash = line[0]
  463 + tag = line[-1]
  464 + path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype)
  465 + if path_feat:
  466 + with open(path_feat, 'rb') as featfile:
  467 + dict_dataset[hash] = (tag, json.loads(featfile.read()))
422 468  
423 469 for tag, feat in dict_dataset.values():
424   - X.append([item for sublist in feat for subsublist in sublist for item in subsublist])
  470 + # X.append([item for sublist in feat for subsublist in sublist for item in subsublist])
  471 + X.append(np.array(feat).ravel().tolist())
425 472 Y.append(int(tag))
426 473  
427   - elif mode == "remote" or mode == "hbase":
  474 + elif mode == "hbase": # remote
428 475 if self.table == None:
429 476 self.table = self.get_table()
430 477  
431 478 col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype
432 479 for key, data in self.table.scan(columns=[col_feat, col_tag]):
433   - X.append([item for sublist in json.loads(data[col_feat]) for subsublist in sublist for item in subsublist])
  480 + X.append(
  481 + [item for sublist in json.loads(data[col_feat]) for subsublist in sublist for item in subsublist])
434 482 Y.append(int(data[col_tag]))
435 483  
436   - elif mode == "spark" or mode == "cluster":
  484 + elif mode == "spark": # cluster
437 485 if self.sparker == None:
438 486 self.sparker = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077')
439 487  
... ...
test/test_data.py
... ... @@ -135,7 +135,19 @@ def test_pipeline():
135 135  
136 136  
def test_crop():
    """Crop the 'Test_1' images, then load raw coefficients from the cropped set.

    Steps: crop the source category, build a second dataset object for the
    'Test_1_crop_pil' category, run format/embed, and print the first
    feature vector and all labels returned by ``load_data``.
    """
    # Formerly exercised through crop.crop_Test().
    val_root = '/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val'

    source_set = ILSVRC.DataILSVRC(base_dir=val_root, category='Test_1')
    source_set.crop()

    cropped_set = ILSVRC.DataILSVRC(base_dir=val_root, category='Test_1_crop_pil')

    # NOTE(review): format/embed run on the un-cropped set while the data is
    # loaded from the cropped one below -- confirm this is intended.
    source_set.format()
    source_set.embed(rate=0.2)

    X, Y = cropped_set.load_data(mode='local', feattype='coef')
    print(X[0])
    print(Y)
  150 +
139 151  
140 152 if __name__ == '__main__':
141 153 # test_MSR()
... ...