From 92d488d8731a722a81487a5150ce7775941940ee Mon Sep 17 00:00:00 2001
From: Chunk <chunkplus@gmail.com>
Date: Mon, 15 Jun 2015 18:05:22 +0800
Subject: [PATCH] .

---
 mdata/ANALYSIS.py | 380 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 380 insertions(+), 0 deletions(-)
 create mode 100644 mdata/ANALYSIS.py
diff --git a/mdata/ANALYSIS.py b/mdata/ANALYSIS.py
new file mode 100644
index 0000000..4668b44
--- /dev/null
+++ b/mdata/ANALYSIS.py
@@ -0,0 +1,380 @@
+__author__ = 'chunk'
+
+import os
+import numpy as np
+from numpy.random import randn
+import pandas as pd
+from scipy import stats
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from .. import mjpeg
+from ..mjpeg import base
+from ..msteg.steganography import LSB, F3, F4, F5
+
+# Seed NumPy's global RNG with a fixed value (sum of the character codes of "whoami").
+np.random.seed(sum(map(ord, "whoami")))
+# Sixteen Python 2 long integers; nothing in the visible part of this module reads them.
+sample_key = [46812L, 20559L, 31360L, 16681L, 27536L, 39553L, 5427L, 63029L, 56572L, 36476L, 25695L,
+              61908L, 63014L, 5908L, 59816L, 56765L]
+# plt.ticklabel_format(style='sci', axis='both', scilimits=(0, 0))
+plt.ticklabel_format(style='sci', axis='both')  # NOTE(review): executes on import
+# Directory containing this file; anal_0000() and pre_crop() join it with '../res/...'.
+package_dir = os.path.dirname(os.path.abspath(__file__))
+
+
+def anal_ILSVRC():
+    """Print size/quality statistics of the file tags and write a randomly labelled copy.
+
+    Reads ``../res/file-tag.tsv`` (relative to ``package_dir``), prints summary
+    statistics, sorts the rows by size and then quality, adds a 0/1 ``class``
+    column drawn from a Bernoulli(0.3) distribution and writes the result,
+    without header or index, to ``../res/test.tsv``.
+    """
+    res_dir = os.path.join(package_dir, '../res')
+    df_ILS = pd.read_csv(os.path.join(res_dir, 'file-tag.tsv'),
+                         names=['hash', 'width', 'height', 'size', 'quality'], sep='\t')
+    # Index with ['size']: the attribute df.size is pandas' element count, not the column.
+    sizes = df_ILS['size']
+    print df_ILS[sizes < 2000000]
+    print df_ILS.describe()
+    # df_ILS.boxplot(column='size')
+    # plt.show()
+
+    length = df_ILS.shape[0]
+    # Positions of the 1/3, 2/3 and 0.9999 ranks; iloc needs ints, so truncate explicitly.
+    ranks = [int(fraction * length) for fraction in (1.0 / 3, 2.0 / 3, 0.9999)]
+
+    # NOTE(review): Series.order() and DataFrame.sort() match the pandas release this
+    # Python 2 file was written for; newer pandas renamed both to sort_values().
+    # Recorded output (row label, size): 7082 108514 / 3826 150389 / 8761 4814541
+    print sizes.order().iloc[ranks]
+
+    # Number of rows per size bucket.
+    print sizes[sizes <= 102400].count()
+    print sizes[(sizes > 102400) & (sizes <= 153600)].count()
+    print sizes[sizes > 153600].count()
+    # Recorded counts:
+    #   (-,100K,150K,+): 4519 / 6163 / 4831
+    #   (-,100K,500K,+): 4519 / 10932 / 62
+
+    ## Quality
+    # Recorded output (row label, quality): 13507 96 / 831 96 / 6529 100
+    print df_ILS.quality.order().iloc[ranks]
+
+    df_new = df_ILS.sort(['size', 'quality'], ascending=True)
+    print df_new
+
+    # One draw per row; a plain array is assigned by position, not by index label.
+    rand_class = stats.bernoulli.rvs(0.3, size=length)
+    # df_new['class'] = pd.Series(rand_class, index=df_new.index)
+    df_new['class'] = rand_class
+
+    print rand_class[:100]
+    print df_new
+
+    df_new.to_csv(os.path.join(res_dir, 'test.tsv'), header=False, index=False, sep='\t')
+
+
+def anal_ILSVRC_Test():
+    """Print size statistics of file-tag-test.tsv, then rewrite it with a zero ``class2`` column."""
+    tsv_path = os.path.join(package_dir, '../res/file-tag-test.tsv')
+    df_ILS_T = pd.read_csv(tsv_path, sep='\t',
+                           names=['hash', 'width', 'height', 'size', 'quality', 'class'])
+    print df_ILS_T
+    sizes = df_ILS_T['size']  # not df.size: that attribute is pandas' element count
+    print sizes.describe()
+    print sizes[sizes <= 102400].count()
+    print sizes[(sizes > 102400) & (sizes <= 153600)].count()
+    print sizes[sizes > 153600].count()
+    df_ILS_T['class2'] = np.zeros(df_ILS_T.shape[0], np.int32)
+    df_ILS_T.to_csv(tsv_path, header=False, index=False, sep='\t')
+
+
+def anal_0000():
+    """Print size statistics of file-tag-test.tsv and show a histogram of the sizes."""
+    df_ILS = pd.read_csv(os.path.join(package_dir, '../res/file-tag-test.tsv'),
+                         names=['hash', 'width', 'height', 'size', 'quality', 'chosen', 'class'],
+                         sep='\t')
+    length = df_ILS.shape[0]
+    sizes = df_ILS['size']  # not df.size: that attribute is pandas' element count
+    print sizes.describe()
+    # Sizes at the 1/3, 2/3 and 0.9999 ranks; iloc needs ints, so truncate explicitly.
+    print sizes.order().iloc[[int(fraction * length) for fraction in (1.0 / 3, 2.0 / 3, 0.9999)]]
+    # Share of rows with exactly these sizes (previously divided by a hard-coded 4592.0).
+    for exact_size in (166500, 187500, 250000):
+        print sizes[sizes == exact_size].count() / float(length)
+    print sizes[sizes <= 166500].count()
+    print sizes[(sizes > 166500) & (sizes <= 187500)].count()
+    print sizes[sizes > 187500].count()
+    plt.ticklabel_format(style='sci', axis='both')
+    df_ILS.hist(column='size', bins=100)
+    plt.title('')
+    plt.xlabel("Image size")
+    plt.ylabel("Frequency")
+    plt.show()
+
+
+def pre_crop():
+    """Print the tag table's shape, then the shape of its rows that are at least 300x300."""
+    tag_path = os.path.join(package_dir, '../res/file-tag-test.tsv')
+    columns = ['hash', 'width', 'height', 'size', 'quality', 'chosen', 'class']
+    tags = pd.read_csv(tag_path, names=columns, sep='\t')
+    big_enough = (tags['width'] >= 300) & (tags['height'] >= 300)
+    print tags.shape
+    print tags[big_enough].shape
+    # Recorded counts by minimum edge (second figure presumably the share of all rows):
+    # 300x300: 4213 0.917 * | 200x200: 4534 0.987 | 400x400: 932 0.202
+
+
+def plot_hist():
+    """Show a stacked bar chart of Spark IO time and CPU time for each data size.
+
+    Column 8 of the table below is drawn as the 'IO' bars and column 6 is
+    stacked on top of it as the 'CPU' bars; column 0 provides the tick labels.
+    Every stack is annotated with the IO share of its total height, truncated
+    to a whole percent.  The figure is displayed with ``plt.show()``; nothing
+    is returned.
+
+    Despite the name, this is a bar chart of the table, not a histogram of samples.
+    """
+    # Columns, as far as the names used in this file and the numbers themselves show:
+    #   0     data size
+    #   1-3   three timings that add up to column 4
+    #   4     serial total
+    #   5, 7  two timings that add up to column 8
+    #   6     Spark processing time
+    #   8     Spark IO time
+    #   9     Spark total (column 6 + column 8)
+    #   10    column 4 divided by column 9
+
+    dat_performance = np.array([
+        [100, 0.583396, 30.847788, 57.884814, 89.315998, 1.471087, 29.364628, 9.114235, 10.585322,
+         39.94995, 2.235697366],
+        [200, 1.147411, 62.815709, 118.217859, 182.180979, 3.008692, 37.920278, 19.589578, 22.59827,
+         60.518548, 3.010332948],
+        [500, 2.763806, 162.806317, 299.778606, 465.348729, 6.81705, 88.291989, 73.446282,
+         80.263332, 168.555321, 2.760807112],
+        [1000, 6.372794, 329.023151, 600.438977, 935.834922, 15.644418, 159.951099, 186.335413,
+         201.979831, 361.93093, 2.585672692],
+        [2000, 14.960961, 679.357936, 1256.341536, 1950.660433, 31.699596, 313.154748, 387.063702,
+         418.763298, 731.918046, 2.665135043],
+        [5000, 39.880657, 1652.537536, 3067.98039, 4760.398583, 73.070203, 694.454719, 898.458633,
+         971.528836, 1665.983555, 2.857410308]])
+
+    # Transpose so that each row of `columns` holds one measured quantity.
+    columns = np.transpose(dat_performance)
+    data_size = columns[0].astype(int)
+    spark_io = columns[8]
+    spark_proc = columns[6]
+
+    positions = np.arange(len(data_size))
+    bar_width = 0.5
+
+    _, ax = plt.subplots()
+    # align='edge' makes every bar start at its position regardless of the
+    # matplotlib default, which the half-width tick offset below relies on.
+    io_bars = ax.bar(positions, spark_io, bar_width, align='edge')
+    cpu_bars = ax.bar(positions, spark_proc, bar_width, color='#e74c3c',
+                      bottom=spark_io, align='edge')
+
+    ax.set_xlabel("Data size")
+    ax.set_ylabel('Time(s)')
+    # Put each tick label under the middle of its bar.
+    ax.set_xticks(positions + bar_width / 2)
+    ax.set_xticklabels(data_size)
+
+    ax.legend((io_bars[0], cpu_bars[0]), ('IO', 'CPU'), loc=2)
+
+    # Write the IO percentage just above the top of each stack.
+    for io_bar, cpu_bar in zip(io_bars, cpu_bars):
+        io_height = io_bar.get_height()
+        total = io_height + cpu_bar.get_height()
+        ax.text(io_bar.get_x() + io_bar.get_width() / 2, 1.005 * total,
+                '%d%%' % int(100.0 * io_height / total),
+                ha='center', va='bottom')
+
+    plt.show()
+
+
+def plot_line_performance():
+    """Plot the serial total and the Spark total/io/proc timings against data size.
+
+    All four curves share circle markers and are told apart by line style; the
+    legend is placed with ``loc=2`` and the figure is shown with ``plt.show()``.
+    Nothing is returned.
+    """
+    # performance
+    dat_performance = np.array([
+        [100, 0.583396, 30.847788, 57.884814, 89.315998, 1.471087, 29.364628, 9.114235, 10.585322,
+         39.94995, 2.235697366],
+        [200, 1.147411, 62.815709, 118.217859, 182.180979, 3.008692, 37.920278, 19.589578, 22.59827,
+         60.518548, 3.010332948],
+        [500, 2.763806, 162.806317, 299.778606, 465.348729, 6.81705, 88.291989, 73.446282,
+         80.263332, 168.555321, 2.760807112],
+        [1000, 6.372794, 329.023151, 600.438977, 935.834922, 15.644418, 159.951099, 186.335413,
+         201.979831, 361.93093, 2.585672692],
+        [2000, 14.960961, 679.357936, 1256.341536, 1950.660433, 31.699596, 313.154748, 387.063702,
+         418.763298, 731.918046, 2.665135043],
+        [5000, 39.880657, 1652.537536, 3067.98039, 4760.398583, 73.070203, 694.454719, 898.458633,
+         971.528836, 1665.983555, 2.857410308]])
+
+    # Transpose so that by_column[i] is column i of the table above.
+    by_column = np.transpose(dat_performance)
+    data_size = by_column[0]
+
+    # (column index, extra plot keywords, legend label), listed in drawing order.
+    # The first curve passes no linestyle, exactly like the other three pass one.
+    curves = [
+        (4, {}, 'serial total'),
+        (9, {'linestyle': '--'}, 'spark total'),
+        (8, {'linestyle': ':'}, 'spark io'),
+        (6, {'linestyle': '-.'}, 'spark proc'),
+    ]
+    for column, extra, label in curves:
+        plt.plot(data_size, by_column[column], marker='o', label=label, **extra)
+
+    plt.xlabel("Data size")
+    plt.ylabel("Time(s)")
+    plt.legend(loc=2)
+    plt.show()
+
+
+def plot_line_io():
+    """Plot 'dist' and 'happybase' timings, with and without compression, over data size.
+
+    The four totals are drawn with the default line style and the happybase
+    io/cpu parts with dashed lines.  Uncompressed series use circle markers
+    ('o'), compressed ones diamonds ('D').  The figure is shown with
+    ``plt.show()``; nothing is returned.
+    """
+    # io
+    dat_io = np.array([
+        [100, 10.585322, 29.364628, 39.94995, 10.286684, 27.079774, 37.366458, 49.995647,
+         55.280739],
+        [200, 22.59827, 37.920278, 60.518548, 22.731275, 38.491461, 61.222736, 76.258928,
+         83.836657],
+        [500, 80.263332, 88.291989, 168.555321, 64.610839, 88.241193, 152.852032, 177.039349,
+         143.524813],
+        [1000, 201.979831, 159.951099, 361.93093, 172.359455, 158.694248, 331.053703, 467.126756,
+         315.578952],
+        [2000, 418.763298, 313.154748, 731.918046, 390.990209, 313.085707, 704.075916, 802.138669,
+         734.133909],
+        [5000, 971.528836, 694.454719, 1665.983555, 898.468232, 717.603061, 1616.071293,
+         1860.610954, 1677.044038]])
+
+    # One array per column of the table, in column order.
+    (data_size,
+     happybase_uncomp_io, happybase_uncomp_cpu, happybase_uncomp_tot,
+     happybase_comp_io, happybase_comp_cpu, happybase_comp_tot,
+     dist_uncomp, dist_comp) = np.transpose(dat_io)
+
+    # The plot calls run totals first, then parts, each list from top to bottom.
+    # (series, marker, legend label) of the four totals.
+    totals = [
+        (dist_uncomp, 'o', 'dist-uncompressed total'),
+        (dist_comp, 'D', 'dist-compressed total'),
+        (happybase_uncomp_tot, 'o', 'happybase-uncompressed total'),
+        (happybase_comp_tot, 'D', 'happybase-compressed total'),
+    ]
+
+    # Same layout for the happybase io and cpu parts.
+    parts = [
+        (happybase_uncomp_io, 'o', 'happybase-uncompressed io'),
+        (happybase_comp_io, 'D', 'happybase-compressed io'),
+        (happybase_uncomp_cpu, 'o', 'happybase-uncompressed cpu'),
+        (happybase_comp_cpu, 'D', 'happybase-compressed cpu'),
+    ]
+
+    # Totals keep the default line style; the parts are dashed.
+    for series, marker, label in totals:
+        plt.plot(data_size, series, marker=marker, label=label)
+    for series, marker, label in parts:
+        plt.plot(data_size, series, marker=marker, linestyle='--', label=label)
+
+    # Axis labels and legend, then display the figure.
+    plt.xlabel("Data size")
+    plt.ylabel("Time")
+    plt.legend(loc=2)
+    plt.show()
+
+
+    # plt.subplot(2, 2, 1)
+    # plt.plot(data_size, dist_uncomp, marker='o', label='dist-uncompressed total')
+    # plt.plot(data_size, dist_comp, marker='o', label='dist-compressed total')
+    # # plt.title('Performance with(out) Compression')
+    # plt.ylabel("Time")
+    # plt.legend(loc=2)
+    #
+    # plt.subplot(2, 2, 2)
+    # plt.plot(data_size, happybase_uncomp_tot, marker='o', label='happybase-uncompressed total')
+    # plt.plot(data_size, happybase_comp_tot, marker='o', label='happybase-compressed total')
+    # plt.legend(loc=2)
+    #
+    # plt.subplot(2, 2, 3)
+    # plt.plot(data_size, happybase_uncomp_io, marker='o', linestyle='--',
+    #          label='happybase-uncompressed io')
+    # plt.plot(data_size, happybase_comp_io, marker='o', linestyle='--',
+    #          label='happybase-compressed io')
+    # plt.ylabel("Time")
+    # plt.xlabel("Data size")
+    # plt.legend(loc=2)
+    #
+    # plt.subplot(2, 2, 4)
+    # plt.plot(data_size, happybase_uncomp_cpu, marker='o', linestyle='--',
+    #          label='happybase-uncompressed cpu')
+    # plt.plot(data_size, happybase_comp_cpu, marker='o', linestyle='--',
+    #          label='happybase-compressed cpu')
+    # plt.xlabel("Data size")
+    # plt.legend(loc=2)
+    # plt.show()
+
+    # plt.plot(data_size, dist_uncomp, marker='o', label='dist-uncompressed total')
+    # plt.plot(data_size, dist_comp, marker='D', linestyle='--',label='dist-compressed total')
+    # plt.xlabel("Data size")
+    # plt.ylabel("Time(s)")
+    # plt.legend(loc=2)
+    # plt.show()
+    #
+    # plt.plot(data_size, happybase_uncomp_tot, marker='o', label='happybase-uncompressed total')
+    # plt.plot(data_size, happybase_comp_tot, marker='D', linestyle='--',label='happybase-compressed total')
+    # plt.xlabel("Data size")
+    # plt.ylabel("Time(s)")
+    # plt.legend(loc=2)
+    # plt.show()
+    #
+    # plt.plot(data_size, happybase_uncomp_io, marker='o',
+    #          label='happybase-uncompressed io')
+    # plt.plot(data_size, happybase_comp_io, marker='D', linestyle='--',
+    #          label='happybase-compressed io')
+    # plt.xlabel("Data size")
+    # plt.ylabel("Time(s)")
+    # plt.legend(loc=2)
+    # plt.show()
+    #
+    # plt.plot(data_size, happybase_uncomp_cpu, marker='o',
+    #          label='happybase-uncompressed cpu')
+    # plt.plot(data_size, happybase_comp_cpu, marker='D', linestyle='--',
+    #          label='happybase-compressed cpu')
+    #
+    # plt.xlabel("Data size")
+    # plt.ylabel("Time(s)")
+    # plt.legend(loc=2)
+    # plt.show()
+
+
+if __name__ == '__main__':
+    # Script entry point: only the anal_0000() report is enabled.  The other
+    # analyses are kept here, commented out, to be switched on by hand.
+    # anal_ILSVRC()
+    # anal_ILSVRC_Test()
+    # print timeit.timeit("anal_ILSVRC()", setup="from __main__ import anal_ILSVRC", number=1)
+    # (the timeit line additionally needs `import timeit`)
+    # pre_crop()
+    # plot_hist(), plot_line_performance(), plot_line_io()
+    anal_0000()
+    pass
--
libgit2 0.21.2