.

Chunk
1 parent 7e755616
Showing 1 changed file with 380 additions and 0 deletions Show diff stats
mdata/ANALYSIS.py
@@ -0,0 +1,380 @@
+__author__ = 'chunk'
+
+import os
+import numpy as np
+from numpy.random import randn
+import pandas as pd
+from scipy import stats
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from .. import mjpeg
+from ..mjpeg import base
+from ..msteg.steganography import LSB, F3, F4, F5
+
+# Seed NumPy's global RNG with a fixed value (the sum of the character codes
+# of "whoami") so that random draws repeat from run to run.
+np.random.seed(sum(map(ord, "whoami")))
+
+# Sixteen fixed integers written as Python 2 long literals. Not referenced by
+# any function shown in this module; presumably a key for the msteg
+# steganography classes imported above -- TODO confirm.
+sample_key = [46812L, 20559L, 31360L, 16681L, 27536L, 39553L, 5427L, 63029L, 56572L, 36476L, 25695L,
+              61908L, 63014L, 5908L, 59816L, 56765L]
+
+# Ask pyplot for scientific-notation tick labels on both axes.
+# NOTE(review): this call runs at import time -- confirm the side effect is intended.
+# plt.ticklabel_format(style='sci', axis='both', scilimits=(0, 0))
+plt.ticklabel_format(style='sci', axis='both')
+
+# Absolute directory containing this module; used below to build data-file
+# paths that do not depend on the current working directory.
+package_dir = os.path.dirname(os.path.abspath(__file__))
+
+
+def anal_ILSVRC():
+    df_ILS = pd.read_csv('../res/file-tag.tsv',
+                         names=['hash', 'width', 'height', 'size', 'quality'], sep='\t')
+    print df_ILS[df_ILS.size < 2000000]
+    print df_ILS.describe()
+    # df_ILS.boxplot(column='size')
+    # plt.show()
+
+    length = df_ILS.shape[0]
+
+    # print type(df_ILS.size.order()) # <class 'pandas.core.series.Series'>
+    print df_ILS.size.order().iloc[map(lambda x: x * length, [1.0 / 3, 2.0 / 3, 0.9999])]
+    """
+    7082     108514
+    3826     150389
+    8761    4814541
+    """
+
+    print df_ILS.size[df_ILS.size <= 102400].count()
+    print df_ILS.size[(df_ILS['size'] > 102400) & (df_ILS['size'] <= 153600)].count()
+    print df_ILS.size[df_ILS.size > 153600].count()
+
+    """
+    (-,100K,150K,+):
+        4519
+        6163
+        4831
+    (-,100K,500K,+):
+        4519
+        10932
+        62
+    """
+
+    ## Quality
+    print df_ILS.quality.order().iloc[map(lambda x: x * length, [1.0 / 3, 2.0 / 3, 0.9999])]
+    """
+    13507     96
+    831       96
+    6529     100
+    """
+    df_new = df_ILS.sort(['size', 'quality'], ascending=True)
+    print df_new
+
+    rand_class = stats.bernoulli.rvs(0.3, size=length)
+    # df_new['class'] = pd.Series(rand_class, index=df_new.index)
+    df_new['class'] = rand_class
+
+    print rand_class[:100]
+    print df_new
+
+    df_new.to_csv('../res/test.tsv', header=False, index=False, sep='\t')
+
+
+def anal_ILSVRC_Test():
+    df_ILS_T = pd.read_csv('../res/file-tag-test.tsv',
+                           names=['hash', 'width', 'height', 'size', 'quality', 'class'], sep='\t')
+    print df_ILS_T
+    print df_ILS_T.size.describe()
+
+    print df_ILS_T.size[df_ILS_T.size <= 102400].count()
+    print df_ILS_T.size[(df_ILS_T['size'] > 102400) & (df_ILS_T['size'] <= 153600)].count()
+    print df_ILS_T.size[df_ILS_T.size > 153600].count()
+
+    length = df_ILS_T.shape[0]
+    df_ILS_T['class2'] = np.zeros(length, np.int32)
+    df_ILS_T.to_csv('../res/file-tag-test.tsv', header=False, index=False, sep='\t')
+
+
+def anal_0000():
+    df_ILS = pd.read_csv(os.path.join(package_dir, '../res/file-tag-test.tsv'),
+                         names=['hash', 'width', 'height', 'size', 'quality', 'chosen', 'class'],
+                         sep='\t')
+    length = df_ILS.shape[0]
+    print df_ILS.size.describe()
+    print df_ILS.size.order().iloc[map(lambda x: x * length, [1.0 / 3, 2.0 / 3, 0.9999])]
+
+    print df_ILS.size[df_ILS.size == 166500].count() / 4592.0
+    print df_ILS.size[df_ILS.size == 187500].count() / 4592.0
+    print df_ILS.size[df_ILS.size == 250000].count() / 4592.0
+
+    print df_ILS.size[df_ILS.size <= 166500].count()
+    print df_ILS.size[(df_ILS['size'] > 166500) & (df_ILS['size'] <= 187500)].count()
+    print df_ILS.size[df_ILS.size > 187500].count()
+
+    plt.ticklabel_format(style='sci', axis='both')
+    df_ILS.hist(column='size', bins=100)
+    plt.title('')
+    plt.xlabel("Image size")
+    plt.ylabel("Frequency")
+    plt.show()
+
+
+def pre_crop():
+    df_ILS = pd.read_csv(os.path.join(package_dir, '../res/file-tag-test.tsv'),
+                         names=['hash', 'width', 'height', 'size', 'quality', 'chosen', 'class'],
+                         sep='\t')
+    print df_ILS.shape
+    print df_ILS[(df_ILS['width'] >= 300) & (df_ILS['height'] >= 300)].shape
+
+    # 300x300 4213 0.917 *
+    # 200x200 4534 0.987
+    # 400x400 932 0.202
+
+
+def plot_hist():
+    dat_performance = np.array([
+        [100, 0.583396, 30.847788, 57.884814, 89.315998, 1.471087, 29.364628, 9.114235, 10.585322,
+         39.94995, 2.235697366],
+        [200, 1.147411, 62.815709, 118.217859, 182.180979, 3.008692, 37.920278, 19.589578, 22.59827,
+         60.518548, 3.010332948],
+        [500, 2.763806, 162.806317, 299.778606, 465.348729, 6.81705, 88.291989, 73.446282,
+         80.263332, 168.555321, 2.760807112],
+        [1000, 6.372794, 329.023151, 600.438977, 935.834922, 15.644418, 159.951099, 186.335413,
+         201.979831, 361.93093, 2.585672692],
+        [2000, 14.960961, 679.357936, 1256.341536, 1950.660433, 31.699596, 313.154748, 387.063702,
+         418.763298, 731.918046, 2.665135043],
+        [5000, 39.880657, 1652.537536, 3067.98039, 4760.398583, 73.070203, 694.454719, 898.458633,
+         971.528836, 1665.983555, 2.857410308]])
+
+    dat_performance = np.transpose(dat_performance)
+    data_size, serial_tot, spark_io, spark_proc, spark_tot = dat_performance[0], dat_performance[4], \
+                                                             dat_performance[8], dat_performance[6], \
+                                                             dat_performance[9]
+
+    data_size = data_size.astype(int)
+    A = [spark_io, spark_proc]
+    E = np.arange(len(data_size))
+    bar_width = 0.5
+    # plt.bar(E, spark_io, width=bar_width)
+    # plt.bar(E, spark_proc, color='#e74c3c', width=bar_width, bottom=spark_io)
+    # plt.xlabel("Data size")
+    # plt.ylabel("Time(s)")
+    # plt.xticks(E + bar_width / 2, data_size)
+    # # plt.xticks(range(len(data_size)), data_size, size='small')
+    # # plt.ylim(ymax=300000)
+    # plt.show()
+
+    # mpl.rcParams.update({'font.size': 5})
+
+    fig, ax = plt.subplots()
+    rects1 = ax.bar(E, spark_io, bar_width)
+    rects2 = ax.bar(E, spark_proc, bar_width, color='#e74c3c', bottom=spark_io)
+
+    # add some text for labels, title and axes ticks
+    plt.xlabel("Data size")
+    ax.set_ylabel('Time(s)')
+    # ax.set_title('IO ratio')
+    ax.set_xticks(E + bar_width / 2)
+    ax.set_xticklabels(data_size)
+
+    ax.legend((rects1[0], rects2[0]), ('IO', 'CPU'), loc=2)
+
+    height1 = [rect.get_height() for rect in rects1]
+    height2 = [rect.get_height() for rect in rects2]
+    for i in range(len(rects1)):
+        height = rects1[i].get_height() + rects2[i].get_height()
+        ax.text(rects1[i].get_x() + rects1[i].get_width() / 2, 1.005 * height, '%d%%' %
+                int((100 * 1.0*height1[i]/height)),
+                ha='center', va='bottom')
+
+    # height1 = [rect.get_height() for rect in rects1]
+    # height2 = [rect.get_height() for rect in rects2]
+    # for i in range(len(rects1)):
+    #     ax.text(rects1[i].get_x() + rects1[i].get_width() / 2, 0.5 * height1[i], '%f' % (0.1 *
+    #                                                                                      height1[
+    #                                                                                          i] /
+    #                                                                                      height2[
+    #                                                                                          i]),
+    #             ha='center', va='bottom')
+
+
+    plt.show()
+
+
+def plot_line_performance():
+    # performance
+    dat_performance = np.array([
+        [100, 0.583396, 30.847788, 57.884814, 89.315998, 1.471087, 29.364628, 9.114235, 10.585322,
+         39.94995, 2.235697366],
+        [200, 1.147411, 62.815709, 118.217859, 182.180979, 3.008692, 37.920278, 19.589578, 22.59827,
+         60.518548, 3.010332948],
+        [500, 2.763806, 162.806317, 299.778606, 465.348729, 6.81705, 88.291989, 73.446282,
+         80.263332, 168.555321, 2.760807112],
+        [1000, 6.372794, 329.023151, 600.438977, 935.834922, 15.644418, 159.951099, 186.335413,
+         201.979831, 361.93093, 2.585672692],
+        [2000, 14.960961, 679.357936, 1256.341536, 1950.660433, 31.699596, 313.154748, 387.063702,
+         418.763298, 731.918046, 2.665135043],
+        [5000, 39.880657, 1652.537536, 3067.98039, 4760.398583, 73.070203, 694.454719, 898.458633,
+         971.528836, 1665.983555, 2.857410308]])
+
+    dat_performance = np.transpose(dat_performance)
+    data_size, serial_tot, spark_io, spark_proc, spark_tot = dat_performance[0], dat_performance[4], \
+                                                             dat_performance[8], dat_performance[6], \
+                                                             dat_performance[9]
+
+    # sns.set_style("white")
+    # data_size = data_size.astype(int)
+    # plt.plot(range(len(data_size)), serial_tot, marker='o', label='serial total')
+    # plt.plot(range(len(data_size)), spark_tot, marker='o', linestyle='--', label='spark total')
+    # plt.plot(range(len(data_size)), spark_io, marker='o', linestyle=':', label='spark io')
+    # plt.plot(range(len(data_size)), spark_proc, marker='o', linestyle='-.', label='spark proc')
+    # plt.xlabel("Data size")
+    # plt.ylabel("Time(s)")
+    # plt.xticks(range(len(data_size)), data_size, size='small')
+    # plt.legend(loc=2)
+    # plt.show()
+
+    plt.plot(data_size, serial_tot, marker='o', label='serial total')
+    plt.plot(data_size, spark_tot, marker='o', linestyle='--', label='spark total')
+    plt.plot(data_size, spark_io, marker='o', linestyle=':', label='spark io')
+    plt.plot(data_size, spark_proc, marker='o', linestyle='-.', label='spark proc')
+    plt.xlabel("Data size")
+    plt.ylabel("Time(s)")
+    plt.legend(loc=2)
+    plt.show()
+
+
+def plot_line_io():
+    # io
+    dat_io = np.array([
+        [100, 10.585322, 29.364628, 39.94995, 10.286684, 27.079774, 37.366458, 49.995647,
+         55.280739],
+        [200, 22.59827, 37.920278, 60.518548, 22.731275, 38.491461, 61.222736, 76.258928,
+         83.836657],
+        [500, 80.263332, 88.291989, 168.555321, 64.610839, 88.241193, 152.852032, 177.039349,
+         143.524813],
+        [1000, 201.979831, 159.951099, 361.93093, 172.359455, 158.694248, 331.053703, 467.126756,
+         315.578952],
+        [2000, 418.763298, 313.154748, 731.918046, 390.990209, 313.085707, 704.075916, 802.138669,
+         734.133909],
+        [5000, 971.528836, 694.454719, 1665.983555, 898.468232, 717.603061, 1616.071293,
+         1860.610954, 1677.044038]])
+
+    dat_io = np.transpose(dat_io)
+    data_size, happybase_uncomp_io, happybase_uncomp_cpu, happybase_uncomp_tot, happybase_comp_io, happybase_comp_cpu, happybase_comp_tot, dist_uncomp, dist_comp = dat_io
+    # data_size = data_size.astype(int)
+    # plt.plot(range(len(data_size)), dist_uncomp, marker='o', label='dist-uncompressed total')
+    # plt.plot(range(len(data_size)), dist_comp, marker='o', label='dist-compressed total')
+    # plt.plot(range(len(data_size)), happybase_uncomp_tot, marker='o', label='happybase-uncompressed total')
+    # plt.plot(range(len(data_size)), happybase_comp_tot, marker='o', label='happybase-compressed total')
+    #
+    # plt.plot(range(len(data_size)), happybase_uncomp_io, marker='o', linestyle='--',
+    #          label='happybase-uncompressed io')
+    # plt.plot(range(len(data_size)), happybase_comp_io, marker='o', linestyle='--',
+    #          label='happybase-compressed io')
+    # plt.plot(range(len(data_size)), happybase_uncomp_cpu, marker='o', linestyle='--',
+    #          label='happybase-uncompressed cpu')
+    # plt.plot(range(len(data_size)), happybase_comp_cpu, marker='o', linestyle='--',
+    #          label='happybase-compressed cpu')
+    #
+    # plt.xlabel("Data size")
+    # plt.ylabel("Time")
+    # plt.xticks(range(len(data_size)), data_size, size='small')
+    # plt.legend(loc=2)
+    # plt.show()
+
+    plt.plot(data_size, dist_uncomp, marker='o', label='dist-uncompressed total')
+    plt.plot(data_size, dist_comp, marker='D', label='dist-compressed total')
+    plt.plot(data_size, happybase_uncomp_tot, marker='o', label='happybase-uncompressed total')
+    plt.plot(data_size, happybase_comp_tot, marker='D', label='happybase-compressed total')
+
+    plt.plot(data_size, happybase_uncomp_io, marker='o', linestyle='--',
+             label='happybase-uncompressed io')
+    plt.plot(data_size, happybase_comp_io, marker='D', linestyle='--',
+             label='happybase-compressed io')
+    plt.plot(data_size, happybase_uncomp_cpu, marker='o', linestyle='--',
+             label='happybase-uncompressed cpu')
+    plt.plot(data_size, happybase_comp_cpu, marker='D', linestyle='--',
+             label='happybase-compressed cpu')
+
+    plt.xlabel("Data size")
+    plt.ylabel("Time")
+    plt.legend(loc=2)
+    plt.show()
+
+
+    # plt.subplot(2, 2, 1)
+    # plt.plot(data_size, dist_uncomp, marker='o', label='dist-uncompressed total')
+    # plt.plot(data_size, dist_comp, marker='o', label='dist-compressed total')
+    # # plt.title('Performance with(out) Compression')
+    # plt.ylabel("Time")
+    # plt.legend(loc=2)
+    #
+    # plt.subplot(2, 2, 2)
+    # plt.plot(data_size, happybase_uncomp_tot, marker='o', label='happybase-uncompressed total')
+    # plt.plot(data_size, happybase_comp_tot, marker='o', label='happybase-compressed total')
+    # plt.legend(loc=2)
+    #
+    # plt.subplot(2, 2, 3)
+    # plt.plot(data_size, happybase_uncomp_io, marker='o', linestyle='--',
+    #          label='happybase-uncompressed io')
+    # plt.plot(data_size, happybase_comp_io, marker='o', linestyle='--',
+    #          label='happybase-compressed io')
+    # plt.ylabel("Time")
+    # plt.xlabel("Data size")
+    # plt.legend(loc=2)
+    #
+    # plt.subplot(2, 2, 4)
+    # plt.plot(data_size, happybase_uncomp_cpu, marker='o', linestyle='--',
+    #          label='happybase-uncompressed cpu')
+    # plt.plot(data_size, happybase_comp_cpu, marker='o', linestyle='--',
+    #          label='happybase-compressed cpu')
+    # plt.xlabel("Data size")
+    # plt.legend(loc=2)
+    # plt.show()
+
+    # plt.plot(data_size, dist_uncomp, marker='o', label='dist-uncompressed total')
+    # plt.plot(data_size, dist_comp, marker='D', linestyle='--',label='dist-compressed total')
+    # plt.xlabel("Data size")
+    # plt.ylabel("Time(s)")
+    # plt.legend(loc=2)
+    # plt.show()
+    #
+    # plt.plot(data_size, happybase_uncomp_tot, marker='o', label='happybase-uncompressed total')
+    # plt.plot(data_size, happybase_comp_tot, marker='D', linestyle='--',label='happybase-compressed total')
+    # plt.xlabel("Data size")
+    # plt.ylabel("Time(s)")
+    # plt.legend(loc=2)
+    # plt.show()
+    #
+    # plt.plot(data_size, happybase_uncomp_io, marker='o',
+    #          label='happybase-uncompressed io')
+    # plt.plot(data_size, happybase_comp_io, marker='D', linestyle='--',
+    #          label='happybase-compressed io')
+    # plt.xlabel("Data size")
+    # plt.ylabel("Time(s)")
+    # plt.legend(loc=2)
+    # plt.show()
+    #
+    # plt.plot(data_size, happybase_uncomp_cpu, marker='o',
+    #          label='happybase-uncompressed cpu')
+    # plt.plot(data_size, happybase_comp_cpu, marker='D', linestyle='--',
+    #          label='happybase-compressed cpu')
+    #
+    # plt.xlabel("Data size")
+    # plt.ylabel("Time(s)")
+    # plt.legend(loc=2)
+    # plt.show()
+
+
+if __name__ == '__main__':
+    # anal_ILSVRC()
+    # anal_ILSVRC_Test()
+    # anal_0000()
+    # print timeit.timeit("anal_ILSVRC()", setup="from __main__ import anal_ILSVRC", number=1)
+
+
+    # pre_crop()
+    # plot_line()
+    anal_0000()
+    pass
...	...	@@ -0,0 +1,380 @@
	1	+__author__ = 'chunk'
	2	+
	3	+import os
	4	+import numpy as np
	5	+from numpy.random import randn
	6	+import pandas as pd
	7	+from scipy import stats
	8	+import matplotlib as mpl
	9	+import matplotlib.pyplot as plt
	10	+import seaborn as sns
	11	+
	12	+import numpy as np
	13	+import matplotlib.pyplot as plt
	14	+import seaborn as sns
	15	+from .. import mjpeg
	16	+from ..mjpeg import base
	17	+from ..msteg.steganography import LSB, F3, F4, F5
	18	+
	19	+np.random.seed(sum(map(ord, "whoami")))
	20	+
	21	+sample_key = [46812L, 20559L, 31360L, 16681L, 27536L, 39553L, 5427L, 63029L, 56572L, 36476L, 25695L,
	22	+ 61908L, 63014L, 5908L, 59816L, 56765L]
	23	+
	24	+# plt.ticklabel_format(style='sci', axis='both', scilimits=(0, 0))
	25	+plt.ticklabel_format(style='sci', axis='both')
	26	+
	27	+package_dir = os.path.dirname(os.path.abspath(__file__))
	28	+
	29	+
	30	+def anal_ILSVRC():
	31	+ df_ILS = pd.read_csv('../res/file-tag.tsv',
	32	+ names=['hash', 'width', 'height', 'size', 'quality'], sep='\t')
	33	+ print df_ILS[df_ILS.size < 2000000]
	34	+ print df_ILS.describe()
	35	+ # df_ILS.boxplot(column='size')
	36	+ # plt.show()
	37	+
	38	+ length = df_ILS.shape[0]
	39	+
	40	+ # print type(df_ILS.size.order()) # <class 'pandas.core.series.Series'>
	41	+ print df_ILS.size.order().iloc[map(lambda x: x * length, [1.0 / 3, 2.0 / 3, 0.9999])]
	42	+ """
	43	+ 7082 108514
	44	+ 3826 150389
	45	+ 8761 4814541
	46	+ """
	47	+
	48	+ print df_ILS.size[df_ILS.size <= 102400].count()
	49	+ print df_ILS.size[(df_ILS['size'] > 102400) & (df_ILS['size'] <= 153600)].count()
	50	+ print df_ILS.size[df_ILS.size > 153600].count()
	51	+
	52	+ """
	53	+ (-,100K,150K,+):
	54	+ 4519
	55	+ 6163
	56	+ 4831
	57	+ (-,100K,500K,+):
	58	+ 4519
	59	+ 10932
	60	+ 62
	61	+ """
	62	+
	63	+ ## Quality
	64	+ print df_ILS.quality.order().iloc[map(lambda x: x * length, [1.0 / 3, 2.0 / 3, 0.9999])]
	65	+ """
	66	+ 13507 96
	67	+ 831 96
	68	+ 6529 100
	69	+ """
	70	+ df_new = df_ILS.sort(['size', 'quality'], ascending=True)
	71	+ print df_new
	72	+
	73	+ rand_class = stats.bernoulli.rvs(0.3, size=length)
	74	+ # df_new['class'] = pd.Series(rand_class, index=df_new.index)
	75	+ df_new['class'] = rand_class
	76	+
	77	+ print rand_class[:100]
	78	+ print df_new
	79	+
	80	+ df_new.to_csv('../res/test.tsv', header=False, index=False, sep='\t')
	81	+
	82	+
	83	+def anal_ILSVRC_Test():
	84	+ df_ILS_T = pd.read_csv('../res/file-tag-test.tsv',
	85	+ names=['hash', 'width', 'height', 'size', 'quality', 'class'], sep='\t')
	86	+ print df_ILS_T
	87	+ print df_ILS_T.size.describe()
	88	+
	89	+ print df_ILS_T.size[df_ILS_T.size <= 102400].count()
	90	+ print df_ILS_T.size[(df_ILS_T['size'] > 102400) & (df_ILS_T['size'] <= 153600)].count()
	91	+ print df_ILS_T.size[df_ILS_T.size > 153600].count()
	92	+
	93	+ length = df_ILS_T.shape[0]
	94	+ df_ILS_T['class2'] = np.zeros(length, np.int32)
	95	+ df_ILS_T.to_csv('../res/file-tag-test.tsv', header=False, index=False, sep='\t')
	96	+
	97	+
	98	+def anal_0000():
	99	+ df_ILS = pd.read_csv(os.path.join(package_dir, '../res/file-tag-test.tsv'),
	100	+ names=['hash', 'width', 'height', 'size', 'quality', 'chosen', 'class'],
	101	+ sep='\t')
	102	+ length = df_ILS.shape[0]
	103	+ print df_ILS.size.describe()
	104	+ print df_ILS.size.order().iloc[map(lambda x: x * length, [1.0 / 3, 2.0 / 3, 0.9999])]
	105	+
	106	+ print df_ILS.size[df_ILS.size == 166500].count() / 4592.0
	107	+ print df_ILS.size[df_ILS.size == 187500].count() / 4592.0
	108	+ print df_ILS.size[df_ILS.size == 250000].count() / 4592.0
	109	+
	110	+ print df_ILS.size[df_ILS.size <= 166500].count()
	111	+ print df_ILS.size[(df_ILS['size'] > 166500) & (df_ILS['size'] <= 187500)].count()
	112	+ print df_ILS.size[df_ILS.size > 187500].count()
	113	+
	114	+ plt.ticklabel_format(style='sci', axis='both')
	115	+ df_ILS.hist(column='size', bins=100)
	116	+ plt.title('')
	117	+ plt.xlabel("Image size")
	118	+ plt.ylabel("Frequency")
	119	+ plt.show()
	120	+
	121	+
	122	+def pre_crop():
	123	+ df_ILS = pd.read_csv(os.path.join(package_dir, '../res/file-tag-test.tsv'),
	124	+ names=['hash', 'width', 'height', 'size', 'quality', 'chosen', 'class'],
	125	+ sep='\t')
	126	+ print df_ILS.shape
	127	+ print df_ILS[(df_ILS['width'] >= 300) & (df_ILS['height'] >= 300)].shape
	128	+
	129	+ # 300x300 4213 0.917 *
	130	+ # 200x200 4534 0.987
	131	+ # 400x400 932 0.202
	132	+
	133	+
	134	+def plot_hist():
	135	+ dat_performance = np.array([
	136	+ [100, 0.583396, 30.847788, 57.884814, 89.315998, 1.471087, 29.364628, 9.114235, 10.585322,
	137	+ 39.94995, 2.235697366],
	138	+ [200, 1.147411, 62.815709, 118.217859, 182.180979, 3.008692, 37.920278, 19.589578, 22.59827,
	139	+ 60.518548, 3.010332948],
	140	+ [500, 2.763806, 162.806317, 299.778606, 465.348729, 6.81705, 88.291989, 73.446282,
	141	+ 80.263332, 168.555321, 2.760807112],
	142	+ [1000, 6.372794, 329.023151, 600.438977, 935.834922, 15.644418, 159.951099, 186.335413,
	143	+ 201.979831, 361.93093, 2.585672692],
	144	+ [2000, 14.960961, 679.357936, 1256.341536, 1950.660433, 31.699596, 313.154748, 387.063702,
	145	+ 418.763298, 731.918046, 2.665135043],
	146	+ [5000, 39.880657, 1652.537536, 3067.98039, 4760.398583, 73.070203, 694.454719, 898.458633,
	147	+ 971.528836, 1665.983555, 2.857410308]])
	148	+
	149	+ dat_performance = np.transpose(dat_performance)
	150	+ data_size, serial_tot, spark_io, spark_proc, spark_tot = dat_performance[0], dat_performance[4], \
	151	+ dat_performance[8], dat_performance[6], \
	152	+ dat_performance[9]
	153	+
	154	+ data_size = data_size.astype(int)
	155	+ A = [spark_io, spark_proc]
	156	+ E = np.arange(len(data_size))
	157	+ bar_width = 0.5
	158	+ # plt.bar(E, spark_io, width=bar_width)
	159	+ # plt.bar(E, spark_proc, color='#e74c3c', width=bar_width, bottom=spark_io)
	160	+ # plt.xlabel("Data size")
	161	+ # plt.ylabel("Time(s)")
	162	+ # plt.xticks(E + bar_width / 2, data_size)
	163	+ # # plt.xticks(range(len(data_size)), data_size, size='small')
	164	+ # # plt.ylim(ymax=300000)
	165	+ # plt.show()
	166	+
	167	+ # mpl.rcParams.update({'font.size': 5})
	168	+
	169	+ fig, ax = plt.subplots()
	170	+ rects1 = ax.bar(E, spark_io, bar_width)
	171	+ rects2 = ax.bar(E, spark_proc, bar_width, color='#e74c3c', bottom=spark_io)
	172	+
	173	+ # add some text for labels, title and axes ticks
	174	+ plt.xlabel("Data size")
	175	+ ax.set_ylabel('Time(s)')
	176	+ # ax.set_title('IO ratio')
	177	+ ax.set_xticks(E + bar_width / 2)
	178	+ ax.set_xticklabels(data_size)
	179	+
	180	+ ax.legend((rects1[0], rects2[0]), ('IO', 'CPU'), loc=2)
	181	+
	182	+ height1 = [rect.get_height() for rect in rects1]
	183	+ height2 = [rect.get_height() for rect in rects2]
	184	+ for i in range(len(rects1)):
	185	+ height = rects1[i].get_height() + rects2[i].get_height()
	186	+ ax.text(rects1[i].get_x() + rects1[i].get_width() / 2, 1.005 * height, '%d%%' %
	187	+ int((100 * 1.0*height1[i]/height)),
	188	+ ha='center', va='bottom')
	189	+
	190	+ # height1 = [rect.get_height() for rect in rects1]
	191	+ # height2 = [rect.get_height() for rect in rects2]
	192	+ # for i in range(len(rects1)):
	193	+ # ax.text(rects1[i].get_x() + rects1[i].get_width() / 2, 0.5 * height1[i], '%f' % (0.1 *
	194	+ # height1[
	195	+ # i] /
	196	+ # height2[
	197	+ # i]),
	198	+ # ha='center', va='bottom')
	199	+
	200	+
	201	+ plt.show()
	202	+
	203	+
	204	+def plot_line_performance():
	205	+ # performance
	206	+ dat_performance = np.array([
	207	+ [100, 0.583396, 30.847788, 57.884814, 89.315998, 1.471087, 29.364628, 9.114235, 10.585322,
	208	+ 39.94995, 2.235697366],
	209	+ [200, 1.147411, 62.815709, 118.217859, 182.180979, 3.008692, 37.920278, 19.589578, 22.59827,
	210	+ 60.518548, 3.010332948],
	211	+ [500, 2.763806, 162.806317, 299.778606, 465.348729, 6.81705, 88.291989, 73.446282,
	212	+ 80.263332, 168.555321, 2.760807112],
	213	+ [1000, 6.372794, 329.023151, 600.438977, 935.834922, 15.644418, 159.951099, 186.335413,
	214	+ 201.979831, 361.93093, 2.585672692],
	215	+ [2000, 14.960961, 679.357936, 1256.341536, 1950.660433, 31.699596, 313.154748, 387.063702,
	216	+ 418.763298, 731.918046, 2.665135043],
	217	+ [5000, 39.880657, 1652.537536, 3067.98039, 4760.398583, 73.070203, 694.454719, 898.458633,
	218	+ 971.528836, 1665.983555, 2.857410308]])
	219	+
	220	+ dat_performance = np.transpose(dat_performance)
	221	+ data_size, serial_tot, spark_io, spark_proc, spark_tot = dat_performance[0], dat_performance[4], \
	222	+ dat_performance[8], dat_performance[6], \
	223	+ dat_performance[9]
	224	+
	225	+ # sns.set_style("white")
	226	+ # data_size = data_size.astype(int)
	227	+ # plt.plot(range(len(data_size)), serial_tot, marker='o', label='serial total')
	228	+ # plt.plot(range(len(data_size)), spark_tot, marker='o', linestyle='--', label='spark total')
	229	+ # plt.plot(range(len(data_size)), spark_io, marker='o', linestyle=':', label='spark io')
	230	+ # plt.plot(range(len(data_size)), spark_proc, marker='o', linestyle='-.', label='spark proc')
	231	+ # plt.xlabel("Data size")
	232	+ # plt.ylabel("Time(s)")
	233	+ # plt.xticks(range(len(data_size)), data_size, size='small')
	234	+ # plt.legend(loc=2)
	235	+ # plt.show()
	236	+
	237	+ plt.plot(data_size, serial_tot, marker='o', label='serial total')
	238	+ plt.plot(data_size, spark_tot, marker='o', linestyle='--', label='spark total')
	239	+ plt.plot(data_size, spark_io, marker='o', linestyle=':', label='spark io')
	240	+ plt.plot(data_size, spark_proc, marker='o', linestyle='-.', label='spark proc')
	241	+ plt.xlabel("Data size")
	242	+ plt.ylabel("Time(s)")
	243	+ plt.legend(loc=2)
	244	+ plt.show()
	245	+
	246	+
	247	+def plot_line_io():
	248	+ # io
	249	+ dat_io = np.array([
	250	+ [100, 10.585322, 29.364628, 39.94995, 10.286684, 27.079774, 37.366458, 49.995647,
	251	+ 55.280739],
	252	+ [200, 22.59827, 37.920278, 60.518548, 22.731275, 38.491461, 61.222736, 76.258928,
	253	+ 83.836657],
	254	+ [500, 80.263332, 88.291989, 168.555321, 64.610839, 88.241193, 152.852032, 177.039349,
	255	+ 143.524813],
	256	+ [1000, 201.979831, 159.951099, 361.93093, 172.359455, 158.694248, 331.053703, 467.126756,
	257	+ 315.578952],
	258	+ [2000, 418.763298, 313.154748, 731.918046, 390.990209, 313.085707, 704.075916, 802.138669,
	259	+ 734.133909],
	260	+ [5000, 971.528836, 694.454719, 1665.983555, 898.468232, 717.603061, 1616.071293,
	261	+ 1860.610954, 1677.044038]])
	262	+
	263	+ dat_io = np.transpose(dat_io)
	264	+ data_size, happybase_uncomp_io, happybase_uncomp_cpu, happybase_uncomp_tot, happybase_comp_io, happybase_comp_cpu, happybase_comp_tot, dist_uncomp, dist_comp = dat_io
	265	+ # data_size = data_size.astype(int)
	266	+ # plt.plot(range(len(data_size)), dist_uncomp, marker='o', label='dist-uncompressed total')
	267	+ # plt.plot(range(len(data_size)), dist_comp, marker='o', label='dist-compressed total')
	268	+ # plt.plot(range(len(data_size)), happybase_uncomp_tot, marker='o', label='happybase-uncompressed total')
	269	+ # plt.plot(range(len(data_size)), happybase_comp_tot, marker='o', label='happybase-compressed total')
	270	+ #
	271	+ # plt.plot(range(len(data_size)), happybase_uncomp_io, marker='o', linestyle='--',
	272	+ # label='happybase-uncompressed io')
	273	+ # plt.plot(range(len(data_size)), happybase_comp_io, marker='o', linestyle='--',
	274	+ # label='happybase-compressed io')
	275	+ # plt.plot(range(len(data_size)), happybase_uncomp_cpu, marker='o', linestyle='--',
	276	+ # label='happybase-uncompressed cpu')
	277	+ # plt.plot(range(len(data_size)), happybase_comp_cpu, marker='o', linestyle='--',
	278	+ # label='happybase-compressed cpu')
	279	+ #
	280	+ # plt.xlabel("Data size")
	281	+ # plt.ylabel("Time")
	282	+ # plt.xticks(range(len(data_size)), data_size, size='small')
	283	+ # plt.legend(loc=2)
	284	+ # plt.show()
	285	+
	286	+ plt.plot(data_size, dist_uncomp, marker='o', label='dist-uncompressed total')
	287	+ plt.plot(data_size, dist_comp, marker='D', label='dist-compressed total')
	288	+ plt.plot(data_size, happybase_uncomp_tot, marker='o', label='happybase-uncompressed total')
	289	+ plt.plot(data_size, happybase_comp_tot, marker='D', label='happybase-compressed total')
	290	+
	291	+ plt.plot(data_size, happybase_uncomp_io, marker='o', linestyle='--',
	292	+ label='happybase-uncompressed io')
	293	+ plt.plot(data_size, happybase_comp_io, marker='D', linestyle='--',
	294	+ label='happybase-compressed io')
	295	+ plt.plot(data_size, happybase_uncomp_cpu, marker='o', linestyle='--',
	296	+ label='happybase-uncompressed cpu')
	297	+ plt.plot(data_size, happybase_comp_cpu, marker='D', linestyle='--',
	298	+ label='happybase-compressed cpu')
	299	+
	300	+ plt.xlabel("Data size")
	301	+ plt.ylabel("Time")
	302	+ plt.legend(loc=2)
	303	+ plt.show()
	304	+
	305	+
	306	+ # plt.subplot(2, 2, 1)
	307	+ # plt.plot(data_size, dist_uncomp, marker='o', label='dist-uncompressed total')
	308	+ # plt.plot(data_size, dist_comp, marker='o', label='dist-compressed total')
	309	+ # # plt.title('Performance with(out) Compression')
	310	+ # plt.ylabel("Time")
	311	+ # plt.legend(loc=2)
	312	+ #
	313	+ # plt.subplot(2, 2, 2)
	314	+ # plt.plot(data_size, happybase_uncomp_tot, marker='o', label='happybase-uncompressed total')
	315	+ # plt.plot(data_size, happybase_comp_tot, marker='o', label='happybase-compressed total')
	316	+ # plt.legend(loc=2)
	317	+ #
	318	+ # plt.subplot(2, 2, 3)
	319	+ # plt.plot(data_size, happybase_uncomp_io, marker='o', linestyle='--',
	320	+ # label='happybase-uncompressed io')
	321	+ # plt.plot(data_size, happybase_comp_io, marker='o', linestyle='--',
	322	+ # label='happybase-compressed io')
	323	+ # plt.ylabel("Time")
	324	+ # plt.xlabel("Data size")
	325	+ # plt.legend(loc=2)
	326	+ #
	327	+ # plt.subplot(2, 2, 4)
	328	+ # plt.plot(data_size, happybase_uncomp_cpu, marker='o', linestyle='--',
	329	+ # label='happybase-uncompressed cpu')
	330	+ # plt.plot(data_size, happybase_comp_cpu, marker='o', linestyle='--',
	331	+ # label='happybase-compressed cpu')
	332	+ # plt.xlabel("Data size")
	333	+ # plt.legend(loc=2)
	334	+ # plt.show()
	335	+
	336	+ # plt.plot(data_size, dist_uncomp, marker='o', label='dist-uncompressed total')
	337	+ # plt.plot(data_size, dist_comp, marker='D', linestyle='--',label='dist-compressed total')
	338	+ # plt.xlabel("Data size")
	339	+ # plt.ylabel("Time(s)")
	340	+ # plt.legend(loc=2)
	341	+ # plt.show()
	342	+ #
	343	+ # plt.plot(data_size, happybase_uncomp_tot, marker='o', label='happybase-uncompressed total')
	344	+ # plt.plot(data_size, happybase_comp_tot, marker='D', linestyle='--',label='happybase-compressed total')
	345	+ # plt.xlabel("Data size")
	346	+ # plt.ylabel("Time(s)")
	347	+ # plt.legend(loc=2)
	348	+ # plt.show()
	349	+ #
	350	+ # plt.plot(data_size, happybase_uncomp_io, marker='o',
	351	+ # label='happybase-uncompressed io')
	352	+ # plt.plot(data_size, happybase_comp_io, marker='D', linestyle='--',
	353	+ # label='happybase-compressed io')
	354	+ # plt.xlabel("Data size")
	355	+ # plt.ylabel("Time(s)")
	356	+ # plt.legend(loc=2)
	357	+ # plt.show()
	358	+ #
	359	+ # plt.plot(data_size, happybase_uncomp_cpu, marker='o',
	360	+ # label='happybase-uncompressed cpu')
	361	+ # plt.plot(data_size, happybase_comp_cpu, marker='D', linestyle='--',
	362	+ # label='happybase-compressed cpu')
	363	+ #
	364	+ # plt.xlabel("Data size")
	365	+ # plt.ylabel("Time(s)")
	366	+ # plt.legend(loc=2)
	367	+ # plt.show()
	368	+
	369	+
	370	+if __name__ == '__main__':
	371	+ # anal_ILSVRC()
	372	+ # anal_ILSVRC_Test()
	373	+ # anal_0000()
	374	+ # print timeit.timeit("anal_ILSVRC()", setup="from __main__ import anal_ILSVRC", number=1)
	375	+
	376	+
	377	+ # pre_crop()
	378	+ # plot_line()
	379	+ anal_0000()
	380	+ pass
...	...