__init__.py
2.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# -*- coding: utf-8 -*-
# Module metadata: handle of the original author.
__author__ = 'chunk'
# Names exported by a star-import of this module.
__all__ = ['DataDumperBase']
class DataDumperBase(object):
    """
    Base class for image data dumping & retrieving.

    A regular directory pattern would be like this:

        dst
        ├── Dev (category)
        │   ├── file-tag.tsv (list_file)
        │   ├── Feat (feat_dir)
        │   │   ├── 0a1
        │   │   ├── 53e
        │   │   └── ...
        │   └── Img (img_dir)
        │       ├── 0a1
        │       ├── 53e
        │       └── ...
        ├── Train (category)
        │   ├── file-tag.tsv (list_file)
        │   ├── Feat
        │   │   ├── 032
        │   │   ├── a21
        │   │   └── ...
        │   └── Img
        │       ├── 032
        │       ├── a21
        │       └── ...
        .
        .
        .

    It can be refactored from the original pattern, which is supposed to be
    generated by web crawlers:

        ├── Neg
        │   ├── aaa.jpg
        │   ├── bbb.jpg
        │   └── ...
        └── Pos
            ├── ccc.jpg
            ├── ddd.jpg
            └── ...

    Convention:
        'im' or 'img' is for image file data, while 'image' or 'image_path'
        is for a file path.

    Every operation below is a no-op in this base class and returns None;
    concrete behavior is presumably supplied by subclasses.
    """

    def __init__(self, base_dir, category):
        """
        Derive the destination paths for one category under ``base_dir``.

        base_dir: root data directory, e.g. '/home/hadoop/data/MSR-IRC2014/'.
            A missing trailing '/' is tolerated when deriving the paths
            below; ``self.base`` itself keeps the value exactly as passed.
        category: sub-directory name under 'dst/' (e.g. 'Train', 'Dev'), or
            None to place the files directly under 'dst/'.

        Attributes set (all directory paths end with '/'):
            base, category  -- the arguments, stored unchanged
            dst_dir         -- '<base_dir>/dst/[<category>/]'
            list_file       -- dst_dir + 'file-tag.tsv' (the list file, not
                               a data file)
            feat_dir        -- dst_dir + 'Feat/'
            img_dir         -- dst_dir + 'Img/'
            table_name, table, connection -- initialised to None
                (presumably filled in by storage-backed subclasses; verify
                against the concrete implementations)
        """
        self.base = base_dir
        self.category = category

        # Paths are built by plain concatenation, so make sure the root ends
        # with a separator; otherwise '/data' + 'dst/' would silently yield
        # '/datadst/'. An empty root is kept empty (relative 'dst/').
        root = base_dir
        if root and not root.endswith('/'):
            root += '/'

        self.dst_dir = root + 'dst/'
        # Identity test: None is a singleton and '!=' may be overridden.
        if self.category is not None:
            self.dst_dir += self.category + '/'

        self.list_file = self.dst_dir + 'file-tag.tsv'
        self.feat_dir = self.dst_dir + 'Feat/'
        self.img_dir = self.dst_dir + 'Img/'

        self.table_name = None
        self.table = None
        self.connection = None

    def format(self):
        """No-op placeholder in the base class; returns None."""
        pass

    def get_table(self):
        """No-op placeholder in the base class; returns None."""
        pass

    def store_img(self):
        """No-op placeholder in the base class; returns None."""
        pass

    def store_tag(self, tagtype):
        """No-op placeholder in the base class; ``tagtype`` is unused here."""
        pass

    def store_feat(self, feattype):
        """No-op placeholder in the base class; ``feattype`` is unused here."""
        pass

    def get_feat(self, image, feattype):
        """
        No-op placeholder in the base class; both arguments are unused here.

        Per the class naming convention, ``image`` denotes a file path
        rather than image data.
        """
        pass

    def extract_feat(self, feattype):
        """No-op placeholder in the base class; ``feattype`` is unused here."""
        pass

    def load_data(self, mode):
        """No-op placeholder in the base class; ``mode`` is unused here."""
        pass