__author__ = 'chunk' import os import numpy as np from numpy.random import randn import pandas as pd from scipy import stats import matplotlib as mpl import matplotlib.pyplot as plt import seaborn as sns import numpy as np import matplotlib.pyplot as plt import seaborn as sns from .. import mjpeg from ..mjpeg import base from ..msteg.steganography import LSB, F3, F4, F5 np.random.seed(sum(map(ord, "whoami"))) sample_key = [46812L, 20559L, 31360L, 16681L, 27536L, 39553L, 5427L, 63029L, 56572L, 36476L, 25695L, 61908L, 63014L, 5908L, 59816L, 56765L] # plt.ticklabel_format(style='sci', axis='both', scilimits=(0, 0)) plt.ticklabel_format(style='sci', axis='both') package_dir = os.path.dirname(os.path.abspath(__file__)) def anal_ILSVRC(): df_ILS = pd.read_csv('../res/file-tag.tsv', names=['hash', 'width', 'height', 'size', 'quality'], sep='\t') print df_ILS[df_ILS.size < 2000000] print df_ILS.describe() # df_ILS.boxplot(column='size') # plt.show() length = df_ILS.shape[0] # print type(df_ILS.size.order()) # print df_ILS.size.order().iloc[map(lambda x: x * length, [1.0 / 3, 2.0 / 3, 0.9999])] """ 7082 108514 3826 150389 8761 4814541 """ print df_ILS.size[df_ILS.size <= 102400].count() print df_ILS.size[(df_ILS['size'] > 102400) & (df_ILS['size'] <= 153600)].count() print df_ILS.size[df_ILS.size > 153600].count() """ (-,100K,150K,+): 4519 6163 4831 (-,100K,500K,+): 4519 10932 62 """ ## Quality print df_ILS.quality.order().iloc[map(lambda x: x * length, [1.0 / 3, 2.0 / 3, 0.9999])] """ 13507 96 831 96 6529 100 """ df_new = df_ILS.sort(['size', 'quality'], ascending=True) print df_new rand_class = stats.bernoulli.rvs(0.3, size=length) # df_new['class'] = pd.Series(rand_class, index=df_new.index) df_new['class'] = rand_class print rand_class[:100] print df_new df_new.to_csv('../res/test.tsv', header=False, index=False, sep='\t') def anal_ILSVRC_Test(): df_ILS_T = pd.read_csv('../res/file-tag-test.tsv', names=['hash', 'width', 'height', 'size', 'quality', 'class'], sep='\t') print df_ILS_T print df_ILS_T.size.describe() print df_ILS_T.size[df_ILS_T.size <= 102400].count() print df_ILS_T.size[(df_ILS_T['size'] > 102400) & (df_ILS_T['size'] <= 153600)].count() print df_ILS_T.size[df_ILS_T.size > 153600].count() length = df_ILS_T.shape[0] df_ILS_T['class2'] = np.zeros(length, np.int32) df_ILS_T.to_csv('../res/file-tag-test.tsv', header=False, index=False, sep='\t') def anal_0000(): df_ILS = pd.read_csv(os.path.join(package_dir, '../res/file-tag-test.tsv'), names=['hash', 'width', 'height', 'size', 'quality', 'chosen', 'class'], sep='\t') length = df_ILS.shape[0] print df_ILS.size.describe() print df_ILS.size.order().iloc[map(lambda x: x * length, [1.0 / 3, 2.0 / 3, 0.9999])] print df_ILS.size[df_ILS.size == 166500].count() / 4592.0 print df_ILS.size[df_ILS.size == 187500].count() / 4592.0 print df_ILS.size[df_ILS.size == 250000].count() / 4592.0 print df_ILS.size[df_ILS.size <= 166500].count() print df_ILS.size[(df_ILS['size'] > 166500) & (df_ILS['size'] <= 187500)].count() print df_ILS.size[df_ILS.size > 187500].count() plt.ticklabel_format(style='sci', axis='both') df_ILS.hist(column='size', bins=100) plt.title('') plt.xlabel("Image size") plt.ylabel("Frequency") plt.show() def pre_crop(): df_ILS = pd.read_csv(os.path.join(package_dir, '../res/file-tag-test.tsv'), names=['hash', 'width', 'height', 'size', 'quality', 'chosen', 'class'], sep='\t') print df_ILS.shape print df_ILS[(df_ILS['width'] >= 300) & (df_ILS['height'] >= 300)].shape # 300x300 4213 0.917 * # 200x200 4534 0.987 # 400x400 932 0.202 def plot_hist(): dat_performance = np.array([ [100, 0.583396, 30.847788, 57.884814, 89.315998, 1.471087, 29.364628, 9.114235, 10.585322, 39.94995, 2.235697366], [200, 1.147411, 62.815709, 118.217859, 182.180979, 3.008692, 37.920278, 19.589578, 22.59827, 60.518548, 3.010332948], [500, 2.763806, 162.806317, 299.778606, 465.348729, 6.81705, 88.291989, 73.446282, 80.263332, 168.555321, 2.760807112], [1000, 6.372794, 329.023151, 600.438977, 935.834922, 15.644418, 159.951099, 186.335413, 201.979831, 361.93093, 2.585672692], [2000, 14.960961, 679.357936, 1256.341536, 1950.660433, 31.699596, 313.154748, 387.063702, 418.763298, 731.918046, 2.665135043], [5000, 39.880657, 1652.537536, 3067.98039, 4760.398583, 73.070203, 694.454719, 898.458633, 971.528836, 1665.983555, 2.857410308]]) dat_performance = np.transpose(dat_performance) data_size, serial_tot, spark_io, spark_proc, spark_tot = dat_performance[0], dat_performance[4], \ dat_performance[8], dat_performance[6], \ dat_performance[9] data_size = data_size.astype(int) A = [spark_io, spark_proc] E = np.arange(len(data_size)) bar_width = 0.5 # plt.bar(E, spark_io, width=bar_width) # plt.bar(E, spark_proc, color='#e74c3c', width=bar_width, bottom=spark_io) # plt.xlabel("Data size") # plt.ylabel("Time(s)") # plt.xticks(E + bar_width / 2, data_size) # # plt.xticks(range(len(data_size)), data_size, size='small') # # plt.ylim(ymax=300000) # plt.show() # mpl.rcParams.update({'font.size': 5}) fig, ax = plt.subplots() rects1 = ax.bar(E, spark_io, bar_width) rects2 = ax.bar(E, spark_proc, bar_width, color='#e74c3c', bottom=spark_io) # add some text for labels, title and axes ticks plt.xlabel("Data size") ax.set_ylabel('Time(s)') # ax.set_title('IO ratio') ax.set_xticks(E + bar_width / 2) ax.set_xticklabels(data_size) ax.legend((rects1[0], rects2[0]), ('IO', 'CPU'), loc=2) height1 = [rect.get_height() for rect in rects1] height2 = [rect.get_height() for rect in rects2] for i in range(len(rects1)): height = rects1[i].get_height() + rects2[i].get_height() ax.text(rects1[i].get_x() + rects1[i].get_width() / 2, 1.005 * height, '%d%%' % int((100 * 1.0*height1[i]/height)), ha='center', va='bottom') # height1 = [rect.get_height() for rect in rects1] # height2 = [rect.get_height() for rect in rects2] # for i in range(len(rects1)): # ax.text(rects1[i].get_x() + rects1[i].get_width() / 2, 0.5 * height1[i], '%f' % (0.1 * # height1[ # i] / # height2[ # i]), # ha='center', va='bottom') plt.show() def plot_line_performance(): # performance dat_performance = np.array([ [100, 0.583396, 30.847788, 57.884814, 89.315998, 1.471087, 29.364628, 9.114235, 10.585322, 39.94995, 2.235697366], [200, 1.147411, 62.815709, 118.217859, 182.180979, 3.008692, 37.920278, 19.589578, 22.59827, 60.518548, 3.010332948], [500, 2.763806, 162.806317, 299.778606, 465.348729, 6.81705, 88.291989, 73.446282, 80.263332, 168.555321, 2.760807112], [1000, 6.372794, 329.023151, 600.438977, 935.834922, 15.644418, 159.951099, 186.335413, 201.979831, 361.93093, 2.585672692], [2000, 14.960961, 679.357936, 1256.341536, 1950.660433, 31.699596, 313.154748, 387.063702, 418.763298, 731.918046, 2.665135043], [5000, 39.880657, 1652.537536, 3067.98039, 4760.398583, 73.070203, 694.454719, 898.458633, 971.528836, 1665.983555, 2.857410308]]) dat_performance = np.transpose(dat_performance) data_size, serial_tot, spark_io, spark_proc, spark_tot = dat_performance[0], dat_performance[4], \ dat_performance[8], dat_performance[6], \ dat_performance[9] # sns.set_style("white") # data_size = data_size.astype(int) # plt.plot(range(len(data_size)), serial_tot, marker='o', label='serial total') # plt.plot(range(len(data_size)), spark_tot, marker='o', linestyle='--', label='spark total') # plt.plot(range(len(data_size)), spark_io, marker='o', linestyle=':', label='spark io') # plt.plot(range(len(data_size)), spark_proc, marker='o', linestyle='-.', label='spark proc') # plt.xlabel("Data size") # plt.ylabel("Time(s)") # plt.xticks(range(len(data_size)), data_size, size='small') # plt.legend(loc=2) # plt.show() plt.plot(data_size, serial_tot, marker='o', label='serial total') plt.plot(data_size, spark_tot, marker='o', linestyle='--', label='spark total') plt.plot(data_size, spark_io, marker='o', linestyle=':', label='spark io') plt.plot(data_size, spark_proc, marker='o', linestyle='-.', label='spark proc') plt.xlabel("Data size") plt.ylabel("Time(s)") plt.legend(loc=2) plt.show() def plot_line_io(): # io dat_io = np.array([ [100, 10.585322, 29.364628, 39.94995, 10.286684, 27.079774, 37.366458, 49.995647, 55.280739], [200, 22.59827, 37.920278, 60.518548, 22.731275, 38.491461, 61.222736, 76.258928, 83.836657], [500, 80.263332, 88.291989, 168.555321, 64.610839, 88.241193, 152.852032, 177.039349, 143.524813], [1000, 201.979831, 159.951099, 361.93093, 172.359455, 158.694248, 331.053703, 467.126756, 315.578952], [2000, 418.763298, 313.154748, 731.918046, 390.990209, 313.085707, 704.075916, 802.138669, 734.133909], [5000, 971.528836, 694.454719, 1665.983555, 898.468232, 717.603061, 1616.071293, 1860.610954, 1677.044038]]) dat_io = np.transpose(dat_io) data_size, happybase_uncomp_io, happybase_uncomp_cpu, happybase_uncomp_tot, happybase_comp_io, happybase_comp_cpu, happybase_comp_tot, dist_uncomp, dist_comp = dat_io # data_size = data_size.astype(int) # plt.plot(range(len(data_size)), dist_uncomp, marker='o', label='dist-uncompressed total') # plt.plot(range(len(data_size)), dist_comp, marker='o', label='dist-compressed total') # plt.plot(range(len(data_size)), happybase_uncomp_tot, marker='o', label='happybase-uncompressed total') # plt.plot(range(len(data_size)), happybase_comp_tot, marker='o', label='happybase-compressed total') # # plt.plot(range(len(data_size)), happybase_uncomp_io, marker='o', linestyle='--', # label='happybase-uncompressed io') # plt.plot(range(len(data_size)), happybase_comp_io, marker='o', linestyle='--', # label='happybase-compressed io') # plt.plot(range(len(data_size)), happybase_uncomp_cpu, marker='o', linestyle='--', # label='happybase-uncompressed cpu') # plt.plot(range(len(data_size)), happybase_comp_cpu, marker='o', linestyle='--', # label='happybase-compressed cpu') # # plt.xlabel("Data size") # plt.ylabel("Time") # plt.xticks(range(len(data_size)), data_size, size='small') # plt.legend(loc=2) # plt.show() plt.plot(data_size, dist_uncomp, marker='o', label='dist-uncompressed total') plt.plot(data_size, dist_comp, marker='D', label='dist-compressed total') plt.plot(data_size, happybase_uncomp_tot, marker='o', label='happybase-uncompressed total') plt.plot(data_size, happybase_comp_tot, marker='D', label='happybase-compressed total') plt.plot(data_size, happybase_uncomp_io, marker='o', linestyle='--', label='happybase-uncompressed io') plt.plot(data_size, happybase_comp_io, marker='D', linestyle='--', label='happybase-compressed io') plt.plot(data_size, happybase_uncomp_cpu, marker='o', linestyle='--', label='happybase-uncompressed cpu') plt.plot(data_size, happybase_comp_cpu, marker='D', linestyle='--', label='happybase-compressed cpu') plt.xlabel("Data size") plt.ylabel("Time") plt.legend(loc=2) plt.show() # plt.subplot(2, 2, 1) # plt.plot(data_size, dist_uncomp, marker='o', label='dist-uncompressed total') # plt.plot(data_size, dist_comp, marker='o', label='dist-compressed total') # # plt.title('Performance with(out) Compression') # plt.ylabel("Time") # plt.legend(loc=2) # # plt.subplot(2, 2, 2) # plt.plot(data_size, happybase_uncomp_tot, marker='o', label='happybase-uncompressed total') # plt.plot(data_size, happybase_comp_tot, marker='o', label='happybase-compressed total') # plt.legend(loc=2) # # plt.subplot(2, 2, 3) # plt.plot(data_size, happybase_uncomp_io, marker='o', linestyle='--', # label='happybase-uncompressed io') # plt.plot(data_size, happybase_comp_io, marker='o', linestyle='--', # label='happybase-compressed io') # plt.ylabel("Time") # plt.xlabel("Data size") # plt.legend(loc=2) # # plt.subplot(2, 2, 4) # plt.plot(data_size, happybase_uncomp_cpu, marker='o', linestyle='--', # label='happybase-uncompressed cpu') # plt.plot(data_size, happybase_comp_cpu, marker='o', linestyle='--', # label='happybase-compressed cpu') # plt.xlabel("Data size") # plt.legend(loc=2) # plt.show() # plt.plot(data_size, dist_uncomp, marker='o', label='dist-uncompressed total') # plt.plot(data_size, dist_comp, marker='D', linestyle='--',label='dist-compressed total') # plt.xlabel("Data size") # plt.ylabel("Time(s)") # plt.legend(loc=2) # plt.show() # # plt.plot(data_size, happybase_uncomp_tot, marker='o', label='happybase-uncompressed total') # plt.plot(data_size, happybase_comp_tot, marker='D', linestyle='--',label='happybase-compressed total') # plt.xlabel("Data size") # plt.ylabel("Time(s)") # plt.legend(loc=2) # plt.show() # # plt.plot(data_size, happybase_uncomp_io, marker='o', # label='happybase-uncompressed io') # plt.plot(data_size, happybase_comp_io, marker='D', linestyle='--', # label='happybase-compressed io') # plt.xlabel("Data size") # plt.ylabel("Time(s)") # plt.legend(loc=2) # plt.show() # # plt.plot(data_size, happybase_uncomp_cpu, marker='o', # label='happybase-uncompressed cpu') # plt.plot(data_size, happybase_comp_cpu, marker='D', linestyle='--', # label='happybase-compressed cpu') # # plt.xlabel("Data size") # plt.ylabel("Time(s)") # plt.legend(loc=2) # plt.show() if __name__ == '__main__': # anal_ILSVRC() # anal_ILSVRC_Test() # anal_0000() # print timeit.timeit("anal_ILSVRC()", setup="from __main__ import anal_ILSVRC", number=1) # pre_crop() # plot_line() anal_0000() pass