Commit ec755e37654e61074fe94b3fb10932bb14c0d856

Authored by Chunk
1 parent 2fe06b7f
Exists in master and in 1 other branch: refactor

cropping.

Showing 2 changed files with 74 additions and 14 deletions
mdata/ILSVRC.py
@@ -22,6 +22,7 @@ import numpy as np
 from numpy.random import randn
 import pandas as pd
 from scipy import stats
+import random
 
 from subprocess import Popen, PIPE, STDOUT
 
@@ -268,6 +269,39 @@ class DataILSVRC(DataDumperBase):
     def embed(self, rate=None):
         self._embed_inner(rate)
 
+
+    def crop(self, size=(300, 300)):
+        for path, subdirs, files in os.walk(self.data_dir):
+            for name in files:
+                image = os.path.join(path, name)
+                print image
+
+                W, H = size
+                try:
+                    im = Image.open(image)
+                    w, h = im.size
+                    if w < W or h < H:
+                        continue
+                    left, upper = random.randint(0, w - W), random.randint(0, h - H)
+                    im = im.crop((left, upper, left + W, upper + H))
+                    im.save(os.path.join(self.data_dir + '_crop_pil', name))
+                except Exception as e:
+                    print '[EXCPT]', e
+                    pass
+
+                # try:
+                #     img = cv2.imread(image, cv2.CV_LOAD_IMAGE_UNCHANGED)
+                #     h, w = img.shape[:2]
+                #     if w < 300 or h < 300:
+                #         continue
+                #     left, upper = random.randint(0, w - 300), random.randint(0, h - 300)
+                #     img_crop = img[upper:upper + 300, left:left + 300]
+                #     cv2.imwrite(os.path.join(base_dir, category + '_crop_cv', name), img_crop)
+                # except Exception as e:
+                #     print '[EXCPT]', e
+                #     pass
+
+
     def get_table(self):
         if self.table != None:
             return self.table
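
Note: the new crop() walks self.data_dir, skips any image smaller than the requested size, takes one random 300x300 crop with PIL, and saves it under a sibling '<data_dir>_crop_pil' directory; a cv2 variant of the same logic is left commented out. Below is a minimal standalone sketch of the same idea (Python 3 for illustration, not the committed Python 2 code; src_dir and dst_dir are hypothetical stand-ins for self.data_dir and its '_crop_pil' sibling):

    # Illustrative sketch only; not the committed method.
    import os
    import random
    from PIL import Image

    def crop_images(src_dir, dst_dir, size=(300, 300)):
        W, H = size
        os.makedirs(dst_dir, exist_ok=True)          # the committed code assumes the output dir exists
        for path, _subdirs, files in os.walk(src_dir):
            for name in files:
                image_path = os.path.join(path, name)
                try:
                    im = Image.open(image_path)
                    w, h = im.size
                    if w < W or h < H:               # skip images smaller than the crop window
                        continue
                    left = random.randint(0, w - W)
                    upper = random.randint(0, h - H)
                    im.crop((left, upper, left + W, upper + H)).save(os.path.join(dst_dir, name))
                except Exception as e:               # unreadable files are reported and skipped, as in the commit
                    print('[EXCPT]', e)

In the committed version a missing output directory would only surface as a caught save error in the except block; the sketch creates it up front instead.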
@@ -410,30 +444,44 @@ class DataILSVRC(DataDumperBase):
 
             dict_dataset = {}
 
-            with open(self.list_file, 'rb') as tsvfile:
-                tsvfile = csv.reader(tsvfile, delimiter='\t')
-                for line in tsvfile:
-                    hash = line[0]
-                    tag = line[-1]
-                    path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype)
-                    if path_feat:
-                        with open(path_feat, 'rb') as featfile:
-                            dict_dataset[hash] = (tag, json.loads(featfile.read()))
+            if feattype == 'coef':  # raw
+                with open(self.list_file, 'rb') as tsvfile:
+                    tsvfile = csv.reader(tsvfile, delimiter='\t')
+                    for line in tsvfile:
+                        hash = line[0]
+                        tag = line[-1]
+                        image = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.jpg')
+                        if image:
+                            im = Jpeg(image, key=sample_key)
+                            dict_dataset[hash] = (tag, im.getCoefBlocks('Y'))
+
+            else:
+                with open(self.list_file, 'rb') as tsvfile:
+                    tsvfile = csv.reader(tsvfile, delimiter='\t')
+                    for line in tsvfile:
+                        hash = line[0]
+                        tag = line[-1]
+                        path_feat = os.path.join(self.feat_dir, hash[:3], hash[3:] + '.' + feattype)
+                        if path_feat:
+                            with open(path_feat, 'rb') as featfile:
+                                dict_dataset[hash] = (tag, json.loads(featfile.read()))
 
             for tag, feat in dict_dataset.values():
-                X.append([item for sublist in feat for subsublist in sublist for item in subsublist])
+                # X.append([item for sublist in feat for subsublist in sublist for item in subsublist])
+                X.append(np.array(feat).ravel().tolist())
                 Y.append(int(tag))
 
-        elif mode == "remote" or mode == "hbase":
+        elif mode == "hbase":  # remote
             if self.table == None:
                 self.table = self.get_table()
 
             col_feat, col_tag = 'cf_feat:' + feattype, 'cf_tag:' + tagtype
             for key, data in self.table.scan(columns=[col_feat, col_tag]):
-                X.append([item for sublist in json.loads(data[col_feat]) for subsublist in sublist for item in subsublist])
+                X.append(
+                    [item for sublist in json.loads(data[col_feat]) for subsublist in sublist for item in subsublist])
                 Y.append(int(data[col_tag]))
 
-        elif mode == "spark" or mode == "cluster":
+        elif mode == "spark":  # cluster
             if self.sparker == None:
                 self.sparker = SC.Sparker(host='HPC-server', appname='ImageCV', master='spark://HPC-server:7077')
 
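
Note: load_data(mode='local') now has a dedicated branch for feattype == 'coef' that reads the JPEG's luminance DCT coefficient blocks directly (Jpeg(...).getCoefBlocks('Y'), where Jpeg and sample_key are assumed to come from elsewhere in the module) instead of a precomputed feature file. The flattening of each feature into a row of X also switches from a hand-written three-level list comprehension to np.array(feat).ravel().tolist(), which works for any regular block layout. A small sketch of that flattening step (the block shape below is only an example, not the repository's actual coefficient layout):

    import numpy as np

    # Example: four 8x8 coefficient blocks; the real shape depends on the image.
    feat = np.arange(4 * 8 * 8).reshape(4, 8, 8)

    # Old: [item for sublist in feat for subsublist in sublist for item in subsublist]
    # New: ravel() flattens the same structure (and any deeper regular nesting) in C order.
    x = np.array(feat).ravel().tolist()
    assert len(x) == feat.size  # 256 values here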
test/test_data.py
@@ -135,7 +135,19 @@ def test_pipeline():
 
 
 def test_crop():
-    crop.crop_Test()
+    # crop.crop_Test()
+    dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_1')
+    dil.crop()
+
+    dil2 = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Test_1_crop_pil')
+
+    dil.format()
+    dil.embed(rate=0.2)
+
+    X,Y = dil2.load_data(mode='local',feattype='coef')
+    print X[0]
+    print Y
+
 
 if __name__ == '__main__':
     # test_MSR()
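
Note: the updated test_crop() exercises the whole flow against a local ILSVRC directory: crop the Test_1 images, run format() and embed(rate=0.2), then load raw 'coef' features from the Test_1_crop_pil copy. A quick standalone sanity check of just the crop output could look like this (Python 3 sketch; the path is an assumption based on the test's category names, i.e. that data_dir is base_dir/category):

    import os
    from PIL import Image

    # Hypothetical path, assuming data_dir == os.path.join(base_dir, category).
    crop_dir = '/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val/Test_1_crop_pil'

    for name in os.listdir(crop_dir):
        with Image.open(os.path.join(crop_dir, name)) as im:
            # crop() defaults to size=(300, 300), so every saved image should match.
            assert im.size == (300, 300), (name, im.size)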