cropping.

Chunk
1 parent 2fe06b7f
Showing 2 changed files with 74 additions and 14 deletions Show diff stats
mdata/ILSVRC.py
test/test_data.py
@@ -22,6 +22,7 @@ import numpy as np
 from numpy.random import randn
 import pandas as pd
 from scipy import stats
+import random
  
 from subprocess import Popen, PIPE, STDOUT
  
@@ -268,6 +269,39 @@ class DataILSVRC(DataDumperBase):
     def embed(self, rate=None):
         self._embed_inner(rate)
  
+
+    def crop(self, size=(300, 300)):
+        for path, subdirs, files in os.walk(self.data_dir):
+            for name in files:
+                image = os.path.join(path, name)
+                print image
+
+                W, H = size
+                try:
+                    im = Image.open(image)
+                    w, h = im.size
+                    if w < W or h < H:
+                        continue
+                    left, upper = random.randint(0, w - W), random.randint(0, h - H)
+                    im = im.crop((left, upper, left + W, upper + H))
+                    im.save(os.path.join(self.data_dir + '_crop_pil', name))
+                except Exception as e:
+                    print '[EXCPT]', e
+                    pass
+
+                    # try:
+                    # img = cv2.imread(image, cv2.CV_LOAD_IMAGE_UNCHANGED)
+                    #     h, w = img.shape[:2]
+                    #     if w < 300 or h < 300:
+                    #         continue
+                    #     left, upper = random.randint(0, w - 300), random.randint(0, h - 300)
+                    #     img_crop = img[upper:upper + 300, left:left + 300]
+                    #     cv2.imwrite(os.path.join(base_dir, category + '_crop_cv', name), img_crop)
+                    # except Exception as e:
+                    #     print '[EXCPT]', e
+                    #     pass
+
+
     def get_table(self):
         if self.table != None:
             return self.table
@@ -410,30 +444,44 @@ class DataILSVRC(DataDumperBase):
  
             dict_dataset = {}
  
-            with open(self.list_file, 'rb') as tsvfile:
-                tsvfile = csv.reader(tsvfile, delimiter='\t')
-                for line in tsvfile:
-                    hash = line[0]
-                    tag = line[-1]
-                    path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype)
-                    if path_feat:
-                        with open(path_feat, 'rb') as featfile:
-                            dict_dataset[hash] = (tag, json.loads(featfile.read()))
+            if feattype == 'coef':  # raw
+                with open(self.list_file, 'rb') as tsvfile:
+                    tsvfile = csv.reader(tsvfile, delimiter='\t')
+                    for line in tsvfile:
+                        hash = line[0]
+                        tag = line[-1]
+                        image = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.jpg')
+                        if image:
+                            im = Jpeg(image, key=sample_key)
+                            dict_dataset[hash] = (tag, im.getCoefBlocks('Y'))
+
+            else:
+                with open(self.list_file, 'rb') as tsvfile:
+                    tsvfile = csv.reader(tsvfile, delimiter='\t')
+                    for line in tsvfile:
+                        hash = line[0]
+                        tag = line[-1]
+                        path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype)
+                        if path_feat:
+                            with open(path_feat, 'rb') as featfile:
+                                dict_dataset[hash] = (tag, json.loads(featfile.read()))
  
             for tag, feat in dict_dataset.values():
-                X.append([item for sublist in feat for subsublist in sublist for item in subsublist])
+                # X.append([item for sublist in feat for subsublist in sublist for item in subsublist])
+                X.append(np.array(feat).ravel().tolist())
                 Y.append(int(tag))
  
-        elif mode == "remote" or mode == "hbase":
+        elif mode == "hbase":  # remote
             if self.table == None:
                 self.table = self.get_table()
  
             col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype
             for key, data in self.table.scan(columns=[col_feat, col_tag]):
-                X.append([item for sublist in json.loads(data[col_feat]) for subsublist in sublist for item in subsublist])
+                X.append(
+                    [item for sublist in json.loads(data[col_feat]) for subsublist in sublist for item in subsublist])
                 Y.append(int(data[col_tag]))
  
-        elif mode == "spark" or mode == "cluster":
+        elif mode == "spark":  # cluster
             if self.sparker == None:
                 self.sparker = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077')
  
@@ -135,7 +135,19 @@ def test_pipeline():
  
  
 def test_crop():
-    crop.crop_Test()
+    # crop.crop_Test()
+    dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_1')
+    dil.crop()
+
+    dil2 = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_1_crop_pil')
+
+    dil.format()
+    dil.embed(rate=0.2)
+
+    X,Y = dil2.load_data(mode='local',feattype='coef')
+    print X[0]
+    print Y
+
  
 if __name__ == '__main__':
     # test_MSR()
...	...	@@ -22,6 +22,7 @@ import numpy as np
22	22	from numpy.random import randn
23	23	import pandas as pd
24	24	from scipy import stats
	25	+import random
25	26
26	27	from subprocess import Popen, PIPE, STDOUT
27	28
...	...	@@ -268,6 +269,39 @@ class DataILSVRC(DataDumperBase):
268	269	def embed(self, rate=None):
269	270	self._embed_inner(rate)
270	271
	272	+
	273	+ def crop(self, size=(300, 300)):
	274	+ for path, subdirs, files in os.walk(self.data_dir):
	275	+ for name in files:
	276	+ image = os.path.join(path, name)
	277	+ print image
	278	+
	279	+ W, H = size
	280	+ try:
	281	+ im = Image.open(image)
	282	+ w, h = im.size
	283	+ if w < W or h < H:
	284	+ continue
	285	+ left, upper = random.randint(0, w - W), random.randint(0, h - H)
	286	+ im = im.crop((left, upper, left + W, upper + H))
	287	+ im.save(os.path.join(self.data_dir + '_crop_pil', name))
	288	+ except Exception as e:
	289	+ print '[EXCPT]', e
	290	+ pass
	291	+
	292	+ # try:
	293	+ # img = cv2.imread(image, cv2.CV_LOAD_IMAGE_UNCHANGED)
	294	+ # h, w = img.shape[:2]
	295	+ # if w < 300 or h < 300:
	296	+ # continue
	297	+ # left, upper = random.randint(0, w - 300), random.randint(0, h - 300)
	298	+ # img_crop = img[upper:upper + 300, left:left + 300]
	299	+ # cv2.imwrite(os.path.join(base_dir, category + '_crop_cv', name), img_crop)
	300	+ # except Exception as e:
	301	+ # print '[EXCPT]', e
	302	+ # pass
	303	+
	304	+
271	305	def get_table(self):
272	306	if self.table != None:
273	307	return self.table
...	...	@@ -410,30 +444,44 @@ class DataILSVRC(DataDumperBase):
410	444
411	445	dict_dataset = {}
412	446
413		- with open(self.list_file, 'rb') as tsvfile:
414		- tsvfile = csv.reader(tsvfile, delimiter='\t')
415		- for line in tsvfile:
416		- hash = line[0]
417		- tag = line[-1]
418		- path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype)
419		- if path_feat:
420		- with open(path_feat, 'rb') as featfile:
421		- dict_dataset[hash] = (tag, json.loads(featfile.read()))
	447	+ if feattype == 'coef': # raw
	448	+ with open(self.list_file, 'rb') as tsvfile:
	449	+ tsvfile = csv.reader(tsvfile, delimiter='\t')
	450	+ for line in tsvfile:
	451	+ hash = line[0]
	452	+ tag = line[-1]
	453	+ image = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.jpg')
	454	+ if image:
	455	+ im = Jpeg(image, key=sample_key)
	456	+ dict_dataset[hash] = (tag, im.getCoefBlocks('Y'))
	457	+
	458	+ else:
	459	+ with open(self.list_file, 'rb') as tsvfile:
	460	+ tsvfile = csv.reader(tsvfile, delimiter='\t')
	461	+ for line in tsvfile:
	462	+ hash = line[0]
	463	+ tag = line[-1]
	464	+ path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype)
	465	+ if path_feat:
	466	+ with open(path_feat, 'rb') as featfile:
	467	+ dict_dataset[hash] = (tag, json.loads(featfile.read()))
422	468
423	469	for tag, feat in dict_dataset.values():
424		- X.append([item for sublist in feat for subsublist in sublist for item in subsublist])
	470	+ # X.append([item for sublist in feat for subsublist in sublist for item in subsublist])
	471	+ X.append(np.array(feat).ravel().tolist())
425	472	Y.append(int(tag))
426	473
427		- elif mode == "remote" or mode == "hbase":
	474	+ elif mode == "hbase": # remote
428	475	if self.table == None:
429	476	self.table = self.get_table()
430	477
431	478	col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype
432	479	for key, data in self.table.scan(columns=[col_feat, col_tag]):
433		- X.append([item for sublist in json.loads(data[col_feat]) for subsublist in sublist for item in subsublist])
	480	+ X.append(
	481	+ [item for sublist in json.loads(data[col_feat]) for subsublist in sublist for item in subsublist])
434	482	Y.append(int(data[col_tag]))
435	483
436		- elif mode == "spark" or mode == "cluster":
	484	+ elif mode == "spark": # cluster
437	485	if self.sparker == None:
438	486	self.sparker = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077')
439	487
...	...
...	...	@@ -135,7 +135,19 @@ def test_pipeline():
135	135
136	136
137	137	def test_crop():
138		- crop.crop_Test()
	138	+ # crop.crop_Test()
	139	+ dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_1')
	140	+ dil.crop()
	141	+
	142	+ dil2 = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_1_crop_pil')
	143	+
	144	+ dil.format()
	145	+ dil.embed(rate=0.2)
	146	+
	147	+ X,Y = dil2.load_data(mode='local',feattype='coef')
	148	+ print X[0]
	149	+ print Y
	150	+
139	151
140	152	if __name__ == '__main__':
141	153	# test_MSR()
...	...