shuffling.

Chunk
1 parent e6be6b61
Showing 3 changed files with 43 additions and 25 deletions Show diff stats
mdata/ILSVRC.py
mmodel/caffe/helper.py
test/test_data.py
@@ -299,10 +299,10 @@ class DataILSVRC(DataDumperBase):
                     # if w < 300 or h < 300:
                     # continue
                     # left, upper = random.randint(0, w - 300), random.randint(0, h - 300)
-                    #     img_crop = img[upper:upper + 300, left:left + 300]
-                    #     cv2.imwrite(os.path.join(base_dir, category + '_crop_cv', name), img_crop)
+                    # img_crop = img[upper:upper + 300, left:left + 300]
+                    # cv2.imwrite(os.path.join(base_dir, category + '_crop_cv', name), img_crop)
                     # except Exception as e:
-                    #     print '[EXCPT]', e
+                    # print '[EXCPT]', e
                     #     pass
  
  
@@ -439,7 +439,7 @@ class DataILSVRC(DataDumperBase):
             pass
  
  
-    def load_data(self, mode='local', feattype='ibd', tagtype='class'):
+    def load_data(self, mode='local', feattype='ibd', tagtype='class', shuffle=False):
         INDEX = []
         X = []
         Y = []
@@ -461,7 +461,8 @@ class DataILSVRC(DataDumperBase):
  
                 for tag, feat in dict_dataset.values():
                     feat.ravel()[[i * 200 + j for i in range(0, 200, 8) for j in range(0, 200, 8)]] = 0
-                    # feat = np.bitwise_and(feat, 1)
+                    feat = np.absolute(feat)
+                    feat = np.bitwise_and(feat, 1)
                     X.append(feat.ravel())
                     Y.append(int(tag))
  
@@ -503,8 +504,13 @@ class DataILSVRC(DataDumperBase):
         else:
             raise Exception("Unknown mode!")
  
-        return X, Y
+        if shuffle:
+            # shuffling
+            Z = zip(X, Y)
+            np.random.shuffle(Z)
+            return Z
  
+        return X, Y
  
  
  
@@ -61,23 +61,37 @@ def _write_lmdb_raw(X, Y, lmdb_name_data='../res/data_lmdb', lmdb_name_label='..
         in_db_data.close()
  
  
-def write_lmdb(X, Y, lmdb_name_data='../res/data_lmdb', lmdb_name_label='../res/label_lmdb'):
+def write_lmdb(X, Y=None, lmdb_name_data='../res/data_lmdb', lmdb_name_label='../res/label_lmdb'):
     """
     X - numpy array of data.
     Y - numpy array of labels.
     """
-    print('writing image data...')
-    for idx in range(int(math.ceil(len(Y) / 1000.0))):
-        in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
-        with in_db_data.begin(write=True) as in_txn:
-            for in_idx, (in_, label_) in enumerate(
-                    zip(X[(1000 * idx):(1000 * (idx + 1))], Y[(1000 * idx):(1000 * (idx + 1))])):
-                # im = caffe.io.load_image(in_)
-                im_dat = caffe.io.array_to_datum(np.array(in_, dtype=int).reshape(1, 200, 200), label_)
-                in_txn.put('{:0>10d}'.format(1000 * idx + in_idx), im_dat.SerializeToString())
-
-                print str(1000 * idx + in_idx + 1) + ' / ' + str(len(X))
-        in_db_data.close()
+    if Y != None:
+        print('writing image data...')
+        for idx in range(int(math.ceil(len(Y) / 1000.0))):
+            in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
+            with in_db_data.begin(write=True) as in_txn:
+                for in_idx, (in_, label_) in enumerate(
+                        zip(X[(1000 * idx):(1000 * (idx + 1))], Y[(1000 * idx):(1000 * (idx + 1))])):
+                    # im = caffe.io.load_image(in_)
+                    im_dat = caffe.io.array_to_datum(np.array(in_, dtype=int).reshape(1, 200, 200), label_)
+                    in_txn.put('{:0>10d}'.format(1000 * idx + in_idx), im_dat.SerializeToString())
+
+                    print str(1000 * idx + in_idx + 1) + ' / ' + str(len(X))
+            in_db_data.close()
+    else:
+        assert isinstance(X[0], tuple)
+        print('writing image data...')
+        for idx in range(int(math.ceil(len(X) / 1000.0))):
+            in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
+            with in_db_data.begin(write=True) as in_txn:
+                for in_idx, (in_, label_) in enumerate(X[(1000 * idx):(1000 * (idx + 1))]):
+                    # im = caffe.io.load_image(in_)
+                    im_dat = caffe.io.array_to_datum(np.array(in_, dtype=int).reshape(1, 200, 200), label_)
+                    in_txn.put('{:0>10d}'.format(1000 * idx + in_idx), im_dat.SerializeToString())
+
+                    print str(1000 * idx + in_idx + 1) + ' / ' + str(len(X))
+            in_db_data.close()
  
  
 if __name__ == '__main__':
@@ -6,6 +6,7 @@ from ..mdata import MSR, CV, ILSVRC, ILSVRC_S, crop
  
 from ..mmodel.caffe.helper import *
  
+
 def test_MSR():
     dmsr = MSR.DataMSR()
     # msrd.format()
@@ -164,14 +165,11 @@ def test_caffe():
     # return
  
     dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_5000_crop_pil')
-    X, Y = dil.load_data(mode='local', feattype='coef')
+    X = dil.load_data(mode='local', feattype='coef', shuffle=True)
     print X[0]
-    print Y
-    print np.array(X).shape, np.array(Y).shape
-
-    write_lmdb(X[2000:3000],Y[2000:3000])
-
+    print np.array(X).shape
  
+    write_lmdb(X[7000:])
  
  
 if __name__ == '__main__':
...	...	@@ -299,10 +299,10 @@ class DataILSVRC(DataDumperBase):
299	299	# if w < 300 or h < 300:
300	300	# continue
301	301	# left, upper = random.randint(0, w - 300), random.randint(0, h - 300)
302		- # img_crop = img[upper:upper + 300, left:left + 300]
303		- # cv2.imwrite(os.path.join(base_dir, category + '_crop_cv', name), img_crop)
	302	+ # img_crop = img[upper:upper + 300, left:left + 300]
	303	+ # cv2.imwrite(os.path.join(base_dir, category + '_crop_cv', name), img_crop)
304	304	# except Exception as e:
305		- # print '[EXCPT]', e
	305	+ # print '[EXCPT]', e
306	306	# pass
307	307
308	308
...	...	@@ -439,7 +439,7 @@ class DataILSVRC(DataDumperBase):
439	439	pass
440	440
441	441
442		- def load_data(self, mode='local', feattype='ibd', tagtype='class'):
	442	+ def load_data(self, mode='local', feattype='ibd', tagtype='class', shuffle=False):
443	443	INDEX = []
444	444	X = []
445	445	Y = []
...	...	@@ -461,7 +461,8 @@ class DataILSVRC(DataDumperBase):
461	461
462	462	for tag, feat in dict_dataset.values():
463	463	feat.ravel()[[i * 200 + j for i in range(0, 200, 8) for j in range(0, 200, 8)]] = 0
464		- # feat = np.bitwise_and(feat, 1)
	464	+ feat = np.absolute(feat)
	465	+ feat = np.bitwise_and(feat, 1)
465	466	X.append(feat.ravel())
466	467	Y.append(int(tag))
467	468
...	...	@@ -503,8 +504,13 @@ class DataILSVRC(DataDumperBase):
503	504	else:
504	505	raise Exception("Unknown mode!")
505	506
506		- return X, Y
	507	+ if shuffle:
	508	+ # shuffling
	509	+ Z = zip(X, Y)
	510	+ np.random.shuffle(Z)
	511	+ return Z
507	512
	513	+ return X, Y
508	514
509	515
510	516
...	...
...	...	@@ -61,23 +61,37 @@ def _write_lmdb_raw(X, Y, lmdb_name_data='../res/data_lmdb', lmdb_name_label='..
61	61	in_db_data.close()
62	62
63	63
64		-def write_lmdb(X, Y, lmdb_name_data='../res/data_lmdb', lmdb_name_label='../res/label_lmdb'):
	64	+def write_lmdb(X, Y=None, lmdb_name_data='../res/data_lmdb', lmdb_name_label='../res/label_lmdb'):
65	65	"""
66	66	X - numpy array of data.
67	67	Y - numpy array of labels.
68	68	"""
69		- print('writing image data...')
70		- for idx in range(int(math.ceil(len(Y) / 1000.0))):
71		- in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
72		- with in_db_data.begin(write=True) as in_txn:
73		- for in_idx, (in_, label_) in enumerate(
74		- zip(X[(1000 * idx):(1000 * (idx + 1))], Y[(1000 * idx):(1000 * (idx + 1))])):
75		- # im = caffe.io.load_image(in_)
76		- im_dat = caffe.io.array_to_datum(np.array(in_, dtype=int).reshape(1, 200, 200), label_)
77		- in_txn.put('{:0>10d}'.format(1000 * idx + in_idx), im_dat.SerializeToString())
78		-
79		- print str(1000 * idx + in_idx + 1) + ' / ' + str(len(X))
80		- in_db_data.close()
	69	+ if Y != None:
	70	+ print('writing image data...')
	71	+ for idx in range(int(math.ceil(len(Y) / 1000.0))):
	72	+ in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
	73	+ with in_db_data.begin(write=True) as in_txn:
	74	+ for in_idx, (in_, label_) in enumerate(
	75	+ zip(X[(1000 * idx):(1000 * (idx + 1))], Y[(1000 * idx):(1000 * (idx + 1))])):
	76	+ # im = caffe.io.load_image(in_)
	77	+ im_dat = caffe.io.array_to_datum(np.array(in_, dtype=int).reshape(1, 200, 200), label_)
	78	+ in_txn.put('{:0>10d}'.format(1000 * idx + in_idx), im_dat.SerializeToString())
	79	+
	80	+ print str(1000 * idx + in_idx + 1) + ' / ' + str(len(X))
	81	+ in_db_data.close()
	82	+ else:
	83	+ assert isinstance(X[0], tuple)
	84	+ print('writing image data...')
	85	+ for idx in range(int(math.ceil(len(X) / 1000.0))):
	86	+ in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
	87	+ with in_db_data.begin(write=True) as in_txn:
	88	+ for in_idx, (in_, label_) in enumerate(X[(1000 * idx):(1000 * (idx + 1))]):
	89	+ # im = caffe.io.load_image(in_)
	90	+ im_dat = caffe.io.array_to_datum(np.array(in_, dtype=int).reshape(1, 200, 200), label_)
	91	+ in_txn.put('{:0>10d}'.format(1000 * idx + in_idx), im_dat.SerializeToString())
	92	+
	93	+ print str(1000 * idx + in_idx + 1) + ' / ' + str(len(X))
	94	+ in_db_data.close()
81	95
82	96
83	97	if __name__ == '__main__':
...	...
...	...	@@ -6,6 +6,7 @@ from ..mdata import MSR, CV, ILSVRC, ILSVRC_S, crop
6	6
7	7	from ..mmodel.caffe.helper import *
8	8
	9	+
9	10	def test_MSR():
10	11	dmsr = MSR.DataMSR()
11	12	# msrd.format()
...	...	@@ -164,14 +165,11 @@ def test_caffe():
164	165	# return
165	166
166	167	dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_5000_crop_pil')
167		- X, Y = dil.load_data(mode='local', feattype='coef')
	168	+ X = dil.load_data(mode='local', feattype='coef', shuffle=True)
168	169	print X[0]
169		- print Y
170		- print np.array(X).shape, np.array(Y).shape
171		-
172		- write_lmdb(X[2000:3000],Y[2000:3000])
173		-
	170	+ print np.array(X).shape
174	171
	172	+ write_lmdb(X[7000:])
175	173
176	174
177	175	if __name__ == '__main__':
...	...