Blame view

mmodel/caffe/helper.py 3.09 KB
84648488   Chunk   reverted.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
__author__ = 'hadoop'

import lmdb
import re, fileinput, math
import numpy as np

import os, sys
import caffe
from caffe.proto import caffe_pb2

# Hard-coded base directory; read_lmdb joins it with 'examples/mnist/mnist_train_lmdb'
# to build its default database path. Presumably the root of a local Caffe
# checkout -- adjust for your machine.
caffe_root = '/home/hadoop/Programs/lib/caffe'
# Absolute path of the directory that contains this file.
package_dir = os.path.dirname(os.path.abspath(__file__))

# Command line to check created files:
# python -mlmdb stat --env=./Downloads/caffe-master/data/liris-accede/train_score_lmdb/


def read_lmdb(lmdb_name=os.path.join(caffe_root, 'examples/mnist/mnist_train_lmdb')):
    """
    Print the label and the data array of every record stored in an LMDB.

    lmdb_name - path of the LMDB directory to read. Defaults to
                'examples/mnist/mnist_train_lmdb' under caffe_root.

    Every value is parsed as a Caffe Datum and converted to a numpy array with
    caffe.io.datum_to_array; one line "<label> <array>" is printed per record.
    The database is opened read-only, so a path that does not hold a database
    raises an error from lmdb instead of creating an empty one.
    """
    # This function never writes, so open read-only: it avoids creating a new
    # empty environment when the path is wrong.
    lmdb_env = lmdb.open(lmdb_name, readonly=True)
    try:
        datum = caffe_pb2.Datum()
        with lmdb_env.begin() as lmdb_txn:
            for _key, value in lmdb_txn.cursor():
                datum.ParseFromString(value)
                data = caffe.io.datum_to_array(datum)
                # Single-argument print(): same output on Python 2 and 3.
                print('{} {}'.format(datum.label, data))
    finally:
        # Always release the environment, even if parsing a record fails.
        lmdb_env.close()


def _write_lmdb_raw(X, Y, lmdb_name_data='../res/data_lmdb', lmdb_name_label='../res/label_lmdb'):
    """
    X - numpy array of data.
    Y - numpy array of labels.
    """

    print('writing labels...')

    # Size of buffer: 1000 elements to reduce memory consumption
    for idx in range(int(math.ceil(len(Y) / 1000.0))):
        in_db_label = lmdb.open(lmdb_name_label, map_size=int(1e12))
        with in_db_label.begin(write=True) as in_txn:
            for label_idx, label_ in enumerate(Y[(1000 * idx):(1000 * (idx + 1))]):
                im_dat = caffe.io.array_to_datum(np.array(label_, dtype=np.uint8).reshape(1, 1, 1))
                in_txn.put('{:0>10d}'.format(1000 * idx + label_idx), im_dat.SerializeToString())

                print str(1000 * idx + label_idx + 1) + ' / ' + str(len(Y))
        in_db_label.close()

    print('writing image data...')

    for idx in range(int(math.ceil(len(X) / 1000.0))):
        in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
        with in_db_data.begin(write=True) as in_txn:
            for in_idx, in_ in enumerate(X[(1000 * idx):(1000 * (idx + 1))]):
                # im = caffe.io.load_image(in_)
                im_dat = caffe.io.array_to_datum(np.array(in_, dtype=int).reshape(1, 200, 200))
                in_txn.put('{:0>10d}'.format(1000 * idx + in_idx), im_dat.SerializeToString())

                print str(1000 * idx + in_idx + 1) + ' / ' + str(len(X))
        in_db_data.close()


def write_lmdb(X, Y, lmdb_name_data='../res/data_lmdb', lmdb_name_label='../res/label_lmdb',
               shape=(1, 200, 200), batch_size=1000):
    """
    Write data and labels into a single LMDB, one labelled Caffe Datum per record.

    X - numpy array of data.
    Y - numpy array of labels; must have the same length as X.
    lmdb_name_data  - path of the LMDB to write.
    lmdb_name_label - not used by this function; retained so existing callers
                      keep working.
    shape      - shape each element of X is reshaped to before it is serialised.
    batch_size - number of records committed per write transaction.

    Each element of X is converted to an int array of the given shape and
    passed to caffe.io.array_to_datum together with the label at the same
    index. Records are keyed by their index, zero-padded to 10 digits.
    Progress is printed as "<written> / <total>" after every record.
    Empty input writes nothing and does not open the database.

    Raises ValueError if X and Y differ in length or batch_size is smaller
    than 1.
    """
    total = len(X)
    if total != len(Y):
        # zip() would silently drop the unmatched samples; fail loudly instead.
        raise ValueError(
            'X and Y must have the same length, got {} and {}'.format(total, len(Y)))
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1, got {!r}'.format(batch_size))

    print('writing image data...')
    if total == 0:
        return

    # Open the environment once for all batches instead of once per batch.
    in_db_data = lmdb.open(lmdb_name_data, map_size=int(1e12))
    try:
        for start in range(0, total, batch_size):
            stop = start + batch_size
            # One transaction per batch keeps the uncommitted data small.
            with in_db_data.begin(write=True) as in_txn:
                for offset, (image, label) in enumerate(zip(X[start:stop], Y[start:stop])):
                    index = start + offset
                    im_dat = caffe.io.array_to_datum(
                        np.array(image, dtype=int).reshape(shape), label)
                    # Encode the key: py-lmdb refuses text keys on Python 3.
                    key = '{:0>10d}'.format(index).encode('ascii')
                    in_txn.put(key, im_dat.SerializeToString())

                    print(str(index + 1) + ' / ' + str(total))
    finally:
        # Close the environment even when a write fails part-way through.
        in_db_data.close()


# Run as a script: print every record of read_lmdb's default database.
if "__main__" == __name__:
    read_lmdb()