__author__ = 'hadoop'

import math
import os

import lmdb
import numpy as np

import caffe
from caffe.proto import caffe_pb2

caffe_root = '/home/hadoop/Programs/lib/caffe'
package_dir = os.path.dirname(os.path.abspath(__file__))

# Command line to check created files:
# python -mlmdb stat --env=./Downloads/caffe-master/data/liris-accede/train_score_lmdb/


def read_lmdb(lmdb_name=os.path.join(caffe_root, 'examples/mnist/mnist_train_lmdb')):
    """Iterate over an LMDB of Caffe Datum records and print each label and data array."""
    lmdb_env = lmdb.open(lmdb_name)
    lmdb_txn = lmdb_env.begin()
    lmdb_cursor = lmdb_txn.cursor()
    datum = caffe_pb2.Datum()

    for key, value in lmdb_cursor:
        datum.ParseFromString(value)
        label = datum.label
        data = caffe.io.datum_to_array(datum)
        print(label, data)


def _write_lmdb_raw(X, Y, lmdb_name_data='../res/data_lmdb', lmdb_name_label='../res/label_lmdb'):
    """Write data and labels into two separate LMDBs.

    X - numpy array of data.
    Y - numpy array of labels.
    """
    print('writing labels...')
    # Buffer size: commit 1000 elements per transaction to reduce memory consumption.
    for idx in range(int(math.ceil(len(Y) / 1000.0))):
        in_db_label = lmdb.open(lmdb_name_label, map_size=int(1e12))
        with in_db_label.begin(write=True) as in_txn:
            for label_idx, label_ in enumerate(Y[(1000 * idx):(1000 * (idx + 1))]):
                im_dat = caffe.io.array_to_datum(np.array(label_, dtype=np.uint8).reshape(1, 1, 1))
                # LMDB keys must be bytes: zero-pad the global index so keys sort in insertion order.
                in_txn.put('{:0>10d}'.format(1000 * idx + label_idx).encode('ascii'),
                           im_dat.SerializeToString())
                print('{} / {}'.format(1000 * idx + label_idx + 1, len(Y)))
        in_db_label.close()

    print('writing image data...')
    for idx in range(int(math.ceil(len(X) / 1000.0))):
        in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
        with in_db_data.begin(write=True) as in_txn:
            for in_idx, in_ in enumerate(X[(1000 * idx):(1000 * (idx + 1))]):
                # im = caffe.io.load_image(in_)
                im_dat = caffe.io.array_to_datum(np.array(in_, dtype=int).reshape(1, 200, 200))
                in_txn.put('{:0>10d}'.format(1000 * idx + in_idx).encode('ascii'),
                           im_dat.SerializeToString())
                print('{} / {}'.format(1000 * idx + in_idx + 1, len(X)))
        in_db_data.close()


def write_lmdb(X, Y=None, lmdb_name_data='../res/data_lmdb', lmdb_name_label='../res/label_lmdb'):
    """Write images and their labels into a single LMDB of Caffe Datum records.

    X - numpy array of data, or a sequence of (data, label) tuples when Y is None.
    Y - numpy array of labels.
    """
    if Y is not None:
        print('writing image data...')
        for idx in range(int(math.ceil(len(Y) / 1000.0))):
            in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
            with in_db_data.begin(write=True) as in_txn:
                for in_idx, (in_, label_) in enumerate(
                        zip(X[(1000 * idx):(1000 * (idx + 1))], Y[(1000 * idx):(1000 * (idx + 1))])):
                    # im = caffe.io.load_image(in_)
                    im_dat = caffe.io.array_to_datum(np.array(in_, dtype=int).reshape(1, 200, 200), label_)
                    in_txn.put('{:0>10d}'.format(1000 * idx + in_idx).encode('ascii'),
                               im_dat.SerializeToString())
                    print('{} / {}'.format(1000 * idx + in_idx + 1, len(X)))
            in_db_data.close()
    else:
        # Without Y, each element of X must already be a (data, label) tuple.
        assert isinstance(X[0], tuple)
        print('writing image data...')
        for idx in range(int(math.ceil(len(X) / 1000.0))):
            in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
            with in_db_data.begin(write=True) as in_txn:
                for in_idx, (in_, label_) in enumerate(X[(1000 * idx):(1000 * (idx + 1))]):
                    # im = caffe.io.load_image(in_)
                    im_dat = caffe.io.array_to_datum(np.array(in_, dtype=int).reshape(1, 200, 200), label_)
                    in_txn.put('{:0>10d}'.format(1000 * idx + in_idx).encode('ascii'),
                               im_dat.SerializeToString())
                    print('{} / {}'.format(1000 * idx + in_idx + 1, len(X)))
            in_db_data.close()


if __name__ == '__main__':
    read_lmdb()
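
# Minimal usage sketch (illustrative only, not part of the original script). The
# output path and the synthetic data below are assumptions chosen to match the
# hard-coded reshape(1, 200, 200) in write_lmdb(); adapt them to the real dataset.
#
#     X = np.random.randint(0, 256, size=(10, 200, 200))   # 10 fake 200x200 "images"
#     Y = [i % 2 for i in range(10)]                        # 10 fake binary labels
#     write_lmdb(X, Y, lmdb_name_data='/tmp/demo_data_lmdb')
#     read_lmdb('/tmp/demo_data_lmdb')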