Commit 92d488d8731a722a81487a5150ce7775941940ee

Authored by Chunk
1 parent 7e755616
Exists in master

.

Showing 1 changed file with 380 additions and 0 deletions   Show diff stats
mdata/ANALYSIS.py 0 → 100644
... ... @@ -0,0 +1,380 @@
  1 +__author__ = 'chunk'
  2 +
  3 +import os
  4 +import numpy as np
  5 +from numpy.random import randn
  6 +import pandas as pd
  7 +from scipy import stats
  8 +import matplotlib as mpl
  9 +import matplotlib.pyplot as plt
  10 +import seaborn as sns
  11 +
  12 +import numpy as np
  13 +import matplotlib.pyplot as plt
  14 +import seaborn as sns
  15 +from .. import mjpeg
  16 +from ..mjpeg import base
  17 +from ..msteg.steganography import LSB, F3, F4, F5
  18 +
# Seed NumPy's global random generator with a fixed value (the sum of the
# character codes of "whoami") so random draws are repeatable between runs.
np.random.seed(sum(map(ord, "whoami")))

# Sixteen fixed integers written as Python 2 long literals (``L`` suffix).
# Not referenced anywhere else in the visible part of this module.
# NOTE(review): presumably a sample key for the steganography classes imported
# above (LSB/F3/F4/F5) -- confirm against their callers.
sample_key = [46812L, 20559L, 31360L, 16681L, 27536L, 39553L, 5427L, 63029L, 56572L, 36476L, 25695L,
              61908L, 63014L, 5908L, 59816L, 56765L]

# plt.ticklabel_format(style='sci', axis='both', scilimits=(0, 0))
# Ask pyplot for scientific tick-label notation on both axes.  This runs at
# import time, i.e. before any of the functions below has created a figure.
plt.ticklabel_format(style='sci', axis='both')

# Absolute path of the directory containing this module; the functions below
# join it with '../res/...' to locate their data files.
package_dir = os.path.dirname(os.path.abspath(__file__))
  28 +
  29 +
def anal_ILSVRC():
    """Summarise the ILSVRC file-tag table and write a randomly labelled copy.

    Reads ``../res/file-tag.tsv`` (tab separated, no header; columns ``hash``,
    ``width``, ``height``, ``size``, ``quality``), prints a few descriptive
    statistics of the ``size`` and ``quality`` columns, sorts the rows by
    ``size`` then ``quality``, attaches a Bernoulli(0.3) 0/1 ``class`` column
    and writes the result to ``../res/test.tsv`` (no header, no index).

    Both paths are resolved relative to this module's directory
    (``package_dir``), the same way ``anal_0000`` and ``pre_crop`` locate
    their data, so the function no longer depends on the working directory.
    """
    df_ILS = pd.read_csv(os.path.join(package_dir, '../res/file-tag.tsv'),
                         names=['hash', 'width', 'height', 'size', 'quality'], sep='\t')

    # NOTE: the column must be selected with ['size'].  Attribute access
    # (``df_ILS.size``) resolves to the built-in ``DataFrame.size`` property
    # (the element count), not to the column of the same name.
    sizes = df_ILS['size']
    qualities = df_ILS['quality']

    print(df_ILS[sizes < 2000000])
    print(df_ILS.describe())
    # df_ILS.boxplot(column='size')
    # plt.show()

    length = df_ILS.shape[0]

    # Integer row positions at 1/3, 2/3 and 99.99% of the sorted column.
    # ``.iloc`` needs integers, and an empty table yields no positions at all.
    fractions = (1.0 / 3, 2.0 / 3, 0.9999)
    positions = [int(frac * length) for frac in fractions] if length else []

    print(sizes.sort_values().iloc[positions])
    # Output recorded from the author's run:
    # 7082 108514
    # 3826 150389
    # 8761 4814541

    # Row counts in the three size bands (<=100K, 100K-150K, >150K).
    print(sizes[sizes <= 102400].count())
    print(sizes[(sizes > 102400) & (sizes <= 153600)].count())
    print(sizes[sizes > 153600].count())
    # Output recorded from the author's runs:
    # (-,100K,150K,+):
    # 4519
    # 6163
    # 4831
    # (-,100K,500K,+):
    # 4519
    # 10932
    # 62

    ## Quality
    print(qualities.sort_values().iloc[positions])
    # Output recorded from the author's run:
    # 13507 96
    # 831 96
    # 6529 100

    df_new = df_ILS.sort_values(['size', 'quality'], ascending=True)
    print(df_new)

    # One 0/1 draw per row with P(1) = 0.3; the array is assigned by position
    # to the already sorted frame.
    rand_class = stats.bernoulli.rvs(0.3, size=length)
    # df_new['class'] = pd.Series(rand_class, index=df_new.index)
    df_new['class'] = rand_class

    print(rand_class[:100])
    print(df_new)

    df_new.to_csv(os.path.join(package_dir, '../res/test.tsv'),
                  header=False, index=False, sep='\t')
  81 +
  82 +
def anal_ILSVRC_Test():
    """Print size statistics of the test file-tag table and add a ``class2`` column.

    Reads ``../res/file-tag-test.tsv`` (tab separated, no header; columns
    ``hash``, ``width``, ``height``, ``size``, ``quality``, ``class``), prints
    the table, a description of the ``size`` column and the row counts in
    three size bands, then appends an all-zero int32 ``class2`` column.

    WARNING: the result is written back to the *same* file that was read, so
    every call rewrites the input and leaves it with one more column.

    The path is resolved relative to this module's directory
    (``package_dir``), consistent with ``anal_0000`` and ``pre_crop``.
    """
    tsv_path = os.path.join(package_dir, '../res/file-tag-test.tsv')
    df_ILS_T = pd.read_csv(tsv_path,
                           names=['hash', 'width', 'height', 'size', 'quality', 'class'],
                           sep='\t')

    # Select the column with ['size']: ``df_ILS_T.size`` would be the
    # built-in ``DataFrame.size`` property (element count), not the column.
    sizes = df_ILS_T['size']

    print(df_ILS_T)
    print(sizes.describe())

    # Row counts in the three size bands (<=100K, 100K-150K, >150K).
    print(sizes[sizes <= 102400].count())
    print(sizes[(sizes > 102400) & (sizes <= 153600)].count())
    print(sizes[sizes > 153600].count())

    length = df_ILS_T.shape[0]
    df_ILS_T['class2'] = np.zeros(length, np.int32)
    df_ILS_T.to_csv(tsv_path, header=False, index=False, sep='\t')
  96 +
  97 +
def anal_0000():
    """Print size statistics of the test file-tag table and show a size histogram.

    Reads ``../res/file-tag-test.tsv`` next to this package (tab separated, no
    header; columns ``hash``, ``width``, ``height``, ``size``, ``quality``,
    ``chosen``, ``class``) and prints, in order:

    * a description of the ``size`` column,
    * the sorted sizes at 1/3, 2/3 and 99.99% of the rows,
    * the fraction of rows whose size is exactly 166500, 187500 and 250000,
    * the row counts in the bands <=166500, (166500, 187500] and >187500.

    Finally a 100-bin histogram of ``size`` is displayed.
    """
    df_ILS = pd.read_csv(os.path.join(package_dir, '../res/file-tag-test.tsv'),
                         names=['hash', 'width', 'height', 'size', 'quality', 'chosen', 'class'],
                         sep='\t')
    length = df_ILS.shape[0]

    # Select the column with ['size']: ``df_ILS.size`` would be the built-in
    # ``DataFrame.size`` property (element count), not the column.
    sizes = df_ILS['size']

    print(sizes.describe())

    # ``.iloc`` needs integer positions; an empty table yields none.
    fractions = (1.0 / 3, 2.0 / 3, 0.9999)
    positions = [int(frac * length) for frac in fractions] if length else []
    print(sizes.sort_values().iloc[positions])

    # Fraction of rows with exactly this size.  The mean of the boolean mask
    # divides by the real row count, where the previous code divided by a
    # hard-coded 4592.
    for exact_size in (166500, 187500, 250000):
        print((sizes == exact_size).mean())

    print(sizes[sizes <= 166500].count())
    print(sizes[(sizes > 166500) & (sizes <= 187500)].count())
    print(sizes[sizes > 187500].count())

    plt.ticklabel_format(style='sci', axis='both')
    df_ILS.hist(column='size', bins=100)
    plt.title('')
    plt.xlabel("Image size")
    plt.ylabel("Frequency")
    plt.show()
  120 +
  121 +
def pre_crop():
    """Report how many images are at least 300x300 pixels.

    Loads ``../res/file-tag-test.tsv`` next to this package and prints the
    shape of the full table followed by the shape of the subset whose
    ``width`` and ``height`` are both >= 300.
    """
    tsv_path = os.path.join(package_dir, '../res/file-tag-test.tsv')
    column_names = ['hash', 'width', 'height', 'size', 'quality', 'chosen', 'class']
    df_ILS = pd.read_csv(tsv_path, names=column_names, sep='\t')

    wide_enough = df_ILS['width'] >= 300
    tall_enough = df_ILS['height'] >= 300

    print(df_ILS.shape)
    print(df_ILS[wide_enough & tall_enough].shape)

    # Results noted by the author for different thresholds
    # (threshold, matching rows, fraction of all rows; * marks the one in use):
    # 300x300 4213 0.917 *
    # 200x200 4534 0.987
    # 400x400 932 0.202
  132 +
  133 +
def plot_hist():
    """Show a stacked bar chart of Spark IO vs. CPU time for each data size.

    The measurements are hard-coded below, one row per data size.  Column 0
    is the data size, column 8 is plotted as the 'IO' part and column 6 as
    the 'CPU' part stacked on top of it.  Above every stack the share of the
    IO part in the stack's total height is written as an integer percentage.
    """
    dat_performance = np.array([
        [100, 0.583396, 30.847788, 57.884814, 89.315998, 1.471087, 29.364628, 9.114235, 10.585322,
         39.94995, 2.235697366],
        [200, 1.147411, 62.815709, 118.217859, 182.180979, 3.008692, 37.920278, 19.589578, 22.59827,
         60.518548, 3.010332948],
        [500, 2.763806, 162.806317, 299.778606, 465.348729, 6.81705, 88.291989, 73.446282,
         80.263332, 168.555321, 2.760807112],
        [1000, 6.372794, 329.023151, 600.438977, 935.834922, 15.644418, 159.951099, 186.335413,
         201.979831, 361.93093, 2.585672692],
        [2000, 14.960961, 679.357936, 1256.341536, 1950.660433, 31.699596, 313.154748, 387.063702,
         418.763298, 731.918046, 2.665135043],
        [5000, 39.880657, 1652.537536, 3067.98039, 4760.398583, 73.070203, 694.454719, 898.458633,
         971.528836, 1665.983555, 2.857410308]])

    # Work column-wise: one array per measured quantity.
    columns = np.transpose(dat_performance)
    data_size = columns[0].astype(int)
    spark_io = columns[8]
    spark_proc = columns[6]

    positions = np.arange(len(data_size))
    bar_width = 0.5

    fig, ax = plt.subplots()
    io_bars = ax.bar(positions, spark_io, bar_width)
    cpu_bars = ax.bar(positions, spark_proc, bar_width, color='#e74c3c', bottom=spark_io)

    # Take the tick positions from the drawn rectangles so the labels sit
    # under the bar centres whether ``bar`` aligns bars by edge or by centre.
    centers = [bar.get_x() + bar.get_width() / 2.0 for bar in io_bars]

    ax.set_xlabel("Data size")
    ax.set_ylabel('Time(s)')
    ax.set_xticks(centers)
    ax.set_xticklabels(data_size)

    ax.legend((io_bars[0], cpu_bars[0]), ('IO', 'CPU'), loc=2)

    # Annotate each stack with the IO share of its total height.
    for center, io_bar, cpu_bar in zip(centers, io_bars, cpu_bars):
        io_height = io_bar.get_height()
        stack_height = io_height + cpu_bar.get_height()
        ax.text(center, 1.005 * stack_height,
                '%d%%' % int(100 * 1.0 * io_height / stack_height),
                ha='center', va='bottom')

    plt.show()
  202 +
  203 +
def plot_line_performance():
    """Show a line chart of serial and Spark timings against data size.

    The measurements are hard-coded below, one row per data size.  Column 0
    is the x axis; columns 4, 9, 8 and 6 are drawn (in that order) as
    'serial total', 'spark total', 'spark io' and 'spark proc'.
    """
    # performance
    measurements = np.array([
        [100, 0.583396, 30.847788, 57.884814, 89.315998, 1.471087, 29.364628, 9.114235, 10.585322,
         39.94995, 2.235697366],
        [200, 1.147411, 62.815709, 118.217859, 182.180979, 3.008692, 37.920278, 19.589578, 22.59827,
         60.518548, 3.010332948],
        [500, 2.763806, 162.806317, 299.778606, 465.348729, 6.81705, 88.291989, 73.446282,
         80.263332, 168.555321, 2.760807112],
        [1000, 6.372794, 329.023151, 600.438977, 935.834922, 15.644418, 159.951099, 186.335413,
         201.979831, 361.93093, 2.585672692],
        [2000, 14.960961, 679.357936, 1256.341536, 1950.660433, 31.699596, 313.154748, 387.063702,
         418.763298, 731.918046, 2.665135043],
        [5000, 39.880657, 1652.537536, 3067.98039, 4760.398583, 73.070203, 694.454719, 898.458633,
         971.528836, 1665.983555, 2.857410308]])

    by_column = np.transpose(measurements)
    data_size = by_column[0]

    # (column index, legend label, extra plot keywords), in drawing order.
    curves = (
        (4, 'serial total', {}),
        (9, 'spark total', {'linestyle': '--'}),
        (8, 'spark io', {'linestyle': ':'}),
        (6, 'spark proc', {'linestyle': '-.'}),
    )
    for column, label, extra in curves:
        plt.plot(data_size, by_column[column], marker='o', label=label, **extra)

    plt.xlabel("Data size")
    plt.ylabel("Time(s)")
    plt.legend(loc=2)
    plt.show()
  245 +
  246 +
def plot_line_io():
    """Show a line chart of dist/happybase timings, with and without compression.

    The measurements are hard-coded below, one row per data size.  Column 0
    is the x axis.  The four 'total' curves are drawn with the default line
    style and the happybase io/cpu curves dashed; uncompressed curves use
    circle markers and compressed curves diamond markers.
    """
    # io
    measurements = np.array([
        [100, 10.585322, 29.364628, 39.94995, 10.286684, 27.079774, 37.366458, 49.995647,
         55.280739],
        [200, 22.59827, 37.920278, 60.518548, 22.731275, 38.491461, 61.222736, 76.258928,
         83.836657],
        [500, 80.263332, 88.291989, 168.555321, 64.610839, 88.241193, 152.852032, 177.039349,
         143.524813],
        [1000, 201.979831, 159.951099, 361.93093, 172.359455, 158.694248, 331.053703, 467.126756,
         315.578952],
        [2000, 418.763298, 313.154748, 731.918046, 390.990209, 313.085707, 704.075916, 802.138669,
         734.133909],
        [5000, 971.528836, 694.454719, 1665.983555, 898.468232, 717.603061, 1616.071293,
         1860.610954, 1677.044038]])

    by_column = np.transpose(measurements)
    data_size = by_column[0]

    # (column index, marker, dashed?, legend label), in drawing order.
    # Column layout: 1-3 happybase uncompressed io/cpu/total,
    # 4-6 happybase compressed io/cpu/total, 7 dist uncompressed, 8 dist compressed.
    curves = (
        (7, 'o', False, 'dist-uncompressed total'),
        (8, 'D', False, 'dist-compressed total'),
        (3, 'o', False, 'happybase-uncompressed total'),
        (6, 'D', False, 'happybase-compressed total'),
        (1, 'o', True, 'happybase-uncompressed io'),
        (4, 'D', True, 'happybase-compressed io'),
        (2, 'o', True, 'happybase-uncompressed cpu'),
        (5, 'D', True, 'happybase-compressed cpu'),
    )
    for column, marker, dashed, label in curves:
        # Only pass ``linestyle`` for the dashed curves so the others keep
        # whatever the default line style is.
        options = {'marker': marker, 'label': label}
        if dashed:
            options['linestyle'] = '--'
        plt.plot(data_size, by_column[column], **options)

    plt.xlabel("Data size")
    plt.ylabel("Time")
    plt.legend(loc=2)
    plt.show()
  304 +
  305 +
  306 + # plt.subplot(2, 2, 1)
  307 + # plt.plot(data_size, dist_uncomp, marker='o', label='dist-uncompressed total')
  308 + # plt.plot(data_size, dist_comp, marker='o', label='dist-compressed total')
  309 + # # plt.title('Performance with(out) Compression')
  310 + # plt.ylabel("Time")
  311 + # plt.legend(loc=2)
  312 + #
  313 + # plt.subplot(2, 2, 2)
  314 + # plt.plot(data_size, happybase_uncomp_tot, marker='o', label='happybase-uncompressed total')
  315 + # plt.plot(data_size, happybase_comp_tot, marker='o', label='happybase-compressed total')
  316 + # plt.legend(loc=2)
  317 + #
  318 + # plt.subplot(2, 2, 3)
  319 + # plt.plot(data_size, happybase_uncomp_io, marker='o', linestyle='--',
  320 + # label='happybase-uncompressed io')
  321 + # plt.plot(data_size, happybase_comp_io, marker='o', linestyle='--',
  322 + # label='happybase-compressed io')
  323 + # plt.ylabel("Time")
  324 + # plt.xlabel("Data size")
  325 + # plt.legend(loc=2)
  326 + #
  327 + # plt.subplot(2, 2, 4)
  328 + # plt.plot(data_size, happybase_uncomp_cpu, marker='o', linestyle='--',
  329 + # label='happybase-uncompressed cpu')
  330 + # plt.plot(data_size, happybase_comp_cpu, marker='o', linestyle='--',
  331 + # label='happybase-compressed cpu')
  332 + # plt.xlabel("Data size")
  333 + # plt.legend(loc=2)
  334 + # plt.show()
  335 +
  336 + # plt.plot(data_size, dist_uncomp, marker='o', label='dist-uncompressed total')
  337 + # plt.plot(data_size, dist_comp, marker='D', linestyle='--',label='dist-compressed total')
  338 + # plt.xlabel("Data size")
  339 + # plt.ylabel("Time(s)")
  340 + # plt.legend(loc=2)
  341 + # plt.show()
  342 + #
  343 + # plt.plot(data_size, happybase_uncomp_tot, marker='o', label='happybase-uncompressed total')
  344 + # plt.plot(data_size, happybase_comp_tot, marker='D', linestyle='--',label='happybase-compressed total')
  345 + # plt.xlabel("Data size")
  346 + # plt.ylabel("Time(s)")
  347 + # plt.legend(loc=2)
  348 + # plt.show()
  349 + #
  350 + # plt.plot(data_size, happybase_uncomp_io, marker='o',
  351 + # label='happybase-uncompressed io')
  352 + # plt.plot(data_size, happybase_comp_io, marker='D', linestyle='--',
  353 + # label='happybase-compressed io')
  354 + # plt.xlabel("Data size")
  355 + # plt.ylabel("Time(s)")
  356 + # plt.legend(loc=2)
  357 + # plt.show()
  358 + #
  359 + # plt.plot(data_size, happybase_uncomp_cpu, marker='o',
  360 + # label='happybase-uncompressed cpu')
  361 + # plt.plot(data_size, happybase_comp_cpu, marker='D', linestyle='--',
  362 + # label='happybase-compressed cpu')
  363 + #
  364 + # plt.xlabel("Data size")
  365 + # plt.ylabel("Time(s)")
  366 + # plt.legend(loc=2)
  367 + # plt.show()
  368 +
  369 +
if __name__ == '__main__':
    # Script entry point: run the size analysis of the test table.
    # The other routines in this module (anal_ILSVRC, anal_ILSVRC_Test,
    # pre_crop, plot_hist, plot_line_performance, plot_line_io) are not run
    # here; swap the call below to run one of them instead.
    anal_0000()
... ...