shuffling.

Chunk
1 parent e6be6b61
Showing 3 changed files with 43 additions and 25 deletions Show diff stats
mdata/ILSVRC.py
mmodel/caffe/helper.py
test/test_data.py
@@ -299,10 +299,10 @@ class DataILSVRC(DataDumperBase):
                     # if w < 300 or h < 300:
                     # continue
                     # left, upper = random.randint(0, w - 300), random.randint(0, h - 300)
-                    #     img_crop = img[upper:upper + 300, left:left + 300]
-                    #     cv2.imwrite(os.path.join(base_dir, category + '_crop_cv', name), img_crop)
+                    # img_crop = img[upper:upper + 300, left:left + 300]
+                    # cv2.imwrite(os.path.join(base_dir, category + '_crop_cv', name), img_crop)
                     # except Exception as e:
-                    #     print '[EXCPT]', e
+                    # print '[EXCPT]', e
                     #     pass
@@ -439,7 +439,7 @@ class DataILSVRC(DataDumperBase):
             pass
-    def load_data(self, mode='local', feattype='ibd', tagtype='class'):
+    def load_data(self, mode='local', feattype='ibd', tagtype='class', shuffle=False):
         INDEX = []
         X = []
         Y = []
@@ -461,7 +461,8 @@ class DataILSVRC(DataDumperBase):
                 for tag, feat in dict_dataset.values():
                     feat.ravel()[[i * 200 + j for i in range(0, 200, 8) for j in range(0, 200, 8)]] = 0
-                    # feat = np.bitwise_and(feat, 1)
+                    feat = np.absolute(feat)
+                    feat = np.bitwise_and(feat, 1)
                     X.append(feat.ravel())
                     Y.append(int(tag))
@@ -503,8 +504,13 @@ class DataILSVRC(DataDumperBase):
         else:
             raise Exception("Unknown mode!")
-        return X, Y
+        if shuffle:
+            # shuffling
+            Z = zip(X, Y)
+            np.random.shuffle(Z)
+            return Z
+        return X, Y
@@ -61,23 +61,37 @@ def _write_lmdb_raw(X, Y, lmdb_name_data='../res/data_lmdb', lmdb_name_label='..
         in_db_data.close()
-def write_lmdb(X, Y, lmdb_name_data='../res/data_lmdb', lmdb_name_label='../res/label_lmdb'):
+def write_lmdb(X, Y=None, lmdb_name_data='../res/data_lmdb', lmdb_name_label='../res/label_lmdb'):
     """
     X - numpy array of data.
     Y - numpy array of labels.
     """
-    print('writing image data...')
-    for idx in range(int(math.ceil(len(Y) / 1000.0))):
-        in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
-        with in_db_data.begin(write=True) as in_txn:
-            for in_idx, (in_, label_) in enumerate(
-                    zip(X[(1000 * idx):(1000 * (idx + 1))], Y[(1000 * idx):(1000 * (idx + 1))])):
-                # im = caffe.io.load_image(in_)
-                im_dat = caffe.io.array_to_datum(np.array(in_, dtype=int).reshape(1, 200, 200), label_)
-                in_txn.put('{:0>10d}'.format(1000 * idx + in_idx), im_dat.SerializeToString())
-
-                print str(1000 * idx + in_idx + 1) + ' / ' + str(len(X))
-        in_db_data.close()
+    if Y != None:
+        print('writing image data...')
+        for idx in range(int(math.ceil(len(Y) / 1000.0))):
+            in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
+            with in_db_data.begin(write=True) as in_txn:
+                for in_idx, (in_, label_) in enumerate(
+                        zip(X[(1000 * idx):(1000 * (idx + 1))], Y[(1000 * idx):(1000 * (idx + 1))])):
+                    # im = caffe.io.load_image(in_)
+                    im_dat = caffe.io.array_to_datum(np.array(in_, dtype=int).reshape(1, 200, 200), label_)
+                    in_txn.put('{:0>10d}'.format(1000 * idx + in_idx), im_dat.SerializeToString())
+
+                    print str(1000 * idx + in_idx + 1) + ' / ' + str(len(X))
+            in_db_data.close()
+    else:
+        assert isinstance(X[0], tuple)
+        print('writing image data...')
+        for idx in range(int(math.ceil(len(X) / 1000.0))):
+            in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
+            with in_db_data.begin(write=True) as in_txn:
+                for in_idx, (in_, label_) in enumerate(X[(1000 * idx):(1000 * (idx + 1))]):
+                    # im = caffe.io.load_image(in_)
+                    im_dat = caffe.io.array_to_datum(np.array(in_, dtype=int).reshape(1, 200, 200), label_)
+                    in_txn.put('{:0>10d}'.format(1000 * idx + in_idx), im_dat.SerializeToString())
+
+                    print str(1000 * idx + in_idx + 1) + ' / ' + str(len(X))
+            in_db_data.close()
 if __name__ == '__main__':
@@ -6,6 +6,7 @@ from ..mdata import MSR, CV, ILSVRC, ILSVRC_S, crop
 from ..mmodel.caffe.helper import *
+
 def test_MSR():
     dmsr = MSR.DataMSR()
     # msrd.format()
@@ -164,14 +165,11 @@ def test_caffe():
     # return
     dil = ILSVRC.DataILSVRC(base_dir='/data/hadoop/ImageNet/ILSVRC/ILSVRC2013_DET_val', category='Train_5000_crop_pil')
-    X, Y = dil.load_data(mode='local', feattype='coef')
+    X = dil.load_data(mode='local', feattype='coef', shuffle=True)
     print X[0]
-    print Y
-    print np.array(X).shape, np.array(Y).shape
-
-    write_lmdb(X[2000:3000],Y[2000:3000])
-
+    print np.array(X).shape
+    write_lmdb(X[7000:])
 if __name__ == '__main__':
	@@ -61,23 +61,37 @@ def _write_lmdb_raw(X, Y, lmdb_name_data='../res/data_lmdb', lmdb_name_label='..		@@ -61,23 +61,37 @@ def _write_lmdb_raw(X, Y, lmdb_name_data='../res/data_lmdb', lmdb_name_label='..
61	in_db_data.close()	61	in_db_data.close()
62		62
63		63
64	-def write_lmdb(X, Y, lmdb_name_data='../res/data_lmdb', lmdb_name_label='../res/label_lmdb'):	64	+def write_lmdb(X, Y=None, lmdb_name_data='../res/data_lmdb', lmdb_name_label='../res/label_lmdb'):
65	"""	65	"""
66	X - numpy array of data.	66	X - numpy array of data.
67	Y - numpy array of labels.	67	Y - numpy array of labels.
68	"""	68	"""
69	- print('writing image data...')
70	- for idx in range(int(math.ceil(len(Y) / 1000.0))):
71	- in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
72	- with in_db_data.begin(write=True) as in_txn:
73	- for in_idx, (in_, label_) in enumerate(
74	- zip(X[(1000 * idx):(1000 * (idx + 1))], Y[(1000 * idx):(1000 * (idx + 1))])):
75	- # im = caffe.io.load_image(in_)
76	- im_dat = caffe.io.array_to_datum(np.array(in_, dtype=int).reshape(1, 200, 200), label_)
77	- in_txn.put('{:0>10d}'.format(1000 * idx + in_idx), im_dat.SerializeToString())
78	-
79	- print str(1000 * idx + in_idx + 1) + ' / ' + str(len(X))
80	- in_db_data.close()	69	+ if Y != None:
		70	+ print('writing image data...')
		71	+ for idx in range(int(math.ceil(len(Y) / 1000.0))):
		72	+ in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
		73	+ with in_db_data.begin(write=True) as in_txn:
		74	+ for in_idx, (in_, label_) in enumerate(
		75	+ zip(X[(1000 * idx):(1000 * (idx + 1))], Y[(1000 * idx):(1000 * (idx + 1))])):
		76	+ # im = caffe.io.load_image(in_)
		77	+ im_dat = caffe.io.array_to_datum(np.array(in_, dtype=int).reshape(1, 200, 200), label_)
		78	+ in_txn.put('{:0>10d}'.format(1000 * idx + in_idx), im_dat.SerializeToString())
		79	+
		80	+ print str(1000 * idx + in_idx + 1) + ' / ' + str(len(X))
		81	+ in_db_data.close()
		82	+ else:
		83	+ assert isinstance(X[0], tuple)
		84	+ print('writing image data...')
		85	+ for idx in range(int(math.ceil(len(X) / 1000.0))):
		86	+ in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
		87	+ with in_db_data.begin(write=True) as in_txn:
		88	+ for in_idx, (in_, label_) in enumerate(X[(1000 * idx):(1000 * (idx + 1))]):
		89	+ # im = caffe.io.load_image(in_)
		90	+ im_dat = caffe.io.array_to_datum(np.array(in_, dtype=int).reshape(1, 200, 200), label_)
		91	+ in_txn.put('{:0>10d}'.format(1000 * idx + in_idx), im_dat.SerializeToString())
		92	+
		93	+ print str(1000 * idx + in_idx + 1) + ' / ' + str(len(X))
		94	+ in_db_data.close()
81		95
82		96
83	if __name__ == '__main__':	97	if __name__ == '__main__':