# ANALYSIS.py (15.1 KB)
__author__ = 'chunk'

import os
import numpy as np
from numpy.random import randn
import pandas as pd
from scipy import stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from .. import mjpeg
from ..mjpeg import base
from ..msteg.steganography import LSB, F3, F4, F5

# Seed NumPy's global random generator with a fixed value (the sum of the
# character codes of "whoami", i.e. 645) so random draws are repeatable.
np.random.seed(sum(map(ord, "whoami")))

# Sixteen integers written as Python 2 long literals.  Nothing in this file
# reads the list.
# NOTE(review): presumably a sample key for the steganography classes imported
# above -- confirm against the modules that import it.
sample_key = [46812L, 20559L, 31360L, 16681L, 27536L, 39553L, 5427L, 63029L, 56572L, 36476L, 25695L,
              61908L, 63014L, 5908L, 59816L, 56765L]

# Request scientific tick-label notation on both axes of pyplot's current axes.
# This runs at import time.
# NOTE(review): pyplot creates a figure/axes on demand when none exists, so
# this probably opens an empty figure as an import side effect -- confirm.
# plt.ticklabel_format(style='sci', axis='both', scilimits=(0, 0))
plt.ticklabel_format(style='sci', axis='both')

# Absolute path of the directory containing this file; used to locate the
# '../res' data files independently of the current working directory.
package_dir = os.path.dirname(os.path.abspath(__file__))


def anal_ILSVRC():
    """Summarise ``../res/file-tag.tsv`` and write a randomly labelled copy.

    The input is read as a headerless, tab-separated table with the columns
    ``hash``, ``width``, ``height``, ``size`` and ``quality``.  The function
    prints descriptive statistics, the ``size`` and ``quality`` values found
    at three rank positions (1/3, 2/3 and 0.9999 of the row count) and the
    number of rows in three size buckets.  It then sorts the rows by ``size``
    and ``quality``, appends a ``class`` column drawn from a Bernoulli(0.3)
    distribution and writes the result to ``../res/test.tsv`` (tab-separated,
    no header, no index).

    Both paths are resolved relative to the current working directory.
    """
    df_ILS = pd.read_csv('../res/file-tag.tsv',
                         names=['hash', 'width', 'height', 'size', 'quality'], sep='\t')

    # ``df_ILS.size`` is pandas' element-count attribute, not the 'size'
    # column, so the column has to be fetched with item access.
    sizes = df_ILS['size']

    print(df_ILS[sizes < 2000000])
    print(df_ILS.describe())

    length = df_ILS.shape[0]

    # ``.iloc`` only accepts integer positions, so truncate the fractional
    # ranks before indexing.
    rank_positions = [int(fraction * length) for fraction in (1.0 / 3, 2.0 / 3, 0.9999)]

    print(sizes.sort_values().iloc[rank_positions])
    # Output recorded by the original author:
    #     7082     108514
    #     3826     150389
    #     8761    4814541

    # Row counts per size bucket: <= 100 KiB, (100 KiB, 150 KiB], > 150 KiB.
    print(sizes[sizes <= 102400].count())
    print(sizes[(sizes > 102400) & (sizes <= 153600)].count())
    print(sizes[sizes > 153600].count())
    # Counts recorded by the original author:
    #     (-,100K,150K,+):
    #         4519
    #         6163
    #         4831
    #     (-,100K,500K,+):
    #         4519
    #         10932
    #         62

    ## Quality
    print(df_ILS['quality'].sort_values().iloc[rank_positions])
    # Output recorded by the original author:
    #     13507     96
    #     831       96
    #     6529     100

    df_new = df_ILS.sort_values(['size', 'quality'], ascending=True)
    print(df_new)

    # One 0/1 draw per row; the array is assigned positionally, i.e. in the
    # sorted row order of ``df_new``.
    rand_class = stats.bernoulli.rvs(0.3, size=length)
    df_new['class'] = rand_class

    print(rand_class[:100])
    print(df_new)

    df_new.to_csv('../res/test.tsv', header=False, index=False, sep='\t')


def anal_ILSVRC_Test():
    """Print size statistics for ``../res/file-tag-test.tsv`` and add a column.

    The file is read as a headerless, tab-separated table with the columns
    ``hash``, ``width``, ``height``, ``size``, ``quality`` and ``class``.
    After printing the table, a description of ``size`` and the row counts of
    three size buckets, an all-zero int32 column (``class2``) is appended and
    the table is written back to the same path (no header, no index).

    The path is resolved relative to the current working directory.

    NOTE(review): the input file is overwritten in place and gains one column
    per run, so a second run no longer matches the six column names given
    here -- confirm this is only meant to be run once per file.
    """
    df_ILS_T = pd.read_csv('../res/file-tag-test.tsv',
                           names=['hash', 'width', 'height', 'size', 'quality', 'class'], sep='\t')
    print(df_ILS_T)

    # ``df_ILS_T.size`` is pandas' element-count attribute (an int), not the
    # 'size' column, so the column has to be fetched with item access.
    sizes = df_ILS_T['size']
    print(sizes.describe())

    # Row counts per size bucket: <= 100 KiB, (100 KiB, 150 KiB], > 150 KiB.
    print(sizes[sizes <= 102400].count())
    print(sizes[(sizes > 102400) & (sizes <= 153600)].count())
    print(sizes[sizes > 153600].count())

    length = df_ILS_T.shape[0]
    df_ILS_T['class2'] = np.zeros(length, np.int32)
    df_ILS_T.to_csv('../res/file-tag-test.tsv', header=False, index=False, sep='\t')


def anal_0000():
    """Print size statistics for ``file-tag-test.tsv`` and show a histogram.

    The file ``../res/file-tag-test.tsv`` (relative to this module's
    directory) is read as a headerless, tab-separated table with the columns
    ``hash``, ``width``, ``height``, ``size``, ``quality``, ``chosen`` and
    ``class``.  The function prints a description of ``size``, the values at
    three rank positions, the share of rows with three specific sizes and the
    row counts of three size buckets, then displays a 100-bin histogram of
    ``size`` (this blocks until the plot window is closed when an interactive
    backend is in use).
    """
    df_ILS = pd.read_csv(os.path.join(package_dir, '../res/file-tag-test.tsv'),
                         names=['hash', 'width', 'height', 'size', 'quality', 'chosen', 'class'],
                         sep='\t')
    length = df_ILS.shape[0]

    # ``df_ILS.size`` is pandas' element-count attribute, not the 'size'
    # column, so the column has to be fetched with item access.
    sizes = df_ILS['size']
    print(sizes.describe())

    # ``.iloc`` only accepts integer positions, so truncate the fractional
    # ranks before indexing.
    rank_positions = [int(fraction * length) for fraction in (1.0 / 3, 2.0 / 3, 0.9999)]
    print(sizes.sort_values().iloc[rank_positions])

    # Share of rows having exactly each of these sizes.
    # NOTE(review): 4592.0 looks like a hard-coded row count for the data set
    # -- confirm before replacing it with ``length``.
    for exact_size in (166500, 187500, 250000):
        print(sizes[sizes == exact_size].count() / 4592.0)

    # Row counts per bucket: <= 166500, (166500, 187500], > 187500.
    print(sizes[sizes <= 166500].count())
    print(sizes[(sizes > 166500) & (sizes <= 187500)].count())
    print(sizes[sizes > 187500].count())

    # ``DataFrame.hist`` draws into a figure of its own, so the tick format
    # must be requested afterwards to apply to the histogram's axes rather
    # than to whatever axes happened to be current beforehand.
    df_ILS.hist(column='size', bins=100)
    plt.ticklabel_format(style='sci', axis='both')
    plt.title('')  # drop the title pandas puts on the subplot
    plt.xlabel("Image size")
    plt.ylabel("Frequency")
    plt.show()


def pre_crop(min_width=300, min_height=300):
    """Print how many images are at least ``min_width`` x ``min_height``.

    Reads ``../res/file-tag-test.tsv`` (relative to this module's directory)
    as a headerless, tab-separated table and prints the shape of the full
    table followed by the shape of the subset whose ``width`` and ``height``
    both reach the given minimums.

    Args:
        min_width: smallest accepted value of the ``width`` column.
        min_height: smallest accepted value of the ``height`` column.

    The defaults reproduce the original fixed 300x300 check.
    """
    df_ILS = pd.read_csv(os.path.join(package_dir, '../res/file-tag-test.tsv'),
                         names=['hash', 'width', 'height', 'size', 'quality', 'chosen', 'class'],
                         sep='\t')
    print(df_ILS.shape)

    large_enough = (df_ILS['width'] >= min_width) & (df_ILS['height'] >= min_height)
    print(df_ILS[large_enough].shape)

    # 300x300 4213 0.917 *
    # 200x200 4534 0.987
    # 400x400 932 0.202


def plot_hist():
    """Show a stacked bar chart of Spark IO and CPU time per data size.

    The timings are hard-coded below, one row per data size.  Column 0 holds
    the data size, column 8 is plotted as the IO time and column 6 as the
    processing (CPU) time, stacked on top of the IO bar.  Each stack is
    labelled with the IO share of the total as a truncated integer
    percentage.  The y axis is labelled in seconds.
    """
    dat_performance = np.array([
        [100, 0.583396, 30.847788, 57.884814, 89.315998, 1.471087, 29.364628, 9.114235, 10.585322,
         39.94995, 2.235697366],
        [200, 1.147411, 62.815709, 118.217859, 182.180979, 3.008692, 37.920278, 19.589578, 22.59827,
         60.518548, 3.010332948],
        [500, 2.763806, 162.806317, 299.778606, 465.348729, 6.81705, 88.291989, 73.446282,
         80.263332, 168.555321, 2.760807112],
        [1000, 6.372794, 329.023151, 600.438977, 935.834922, 15.644418, 159.951099, 186.335413,
         201.979831, 361.93093, 2.585672692],
        [2000, 14.960961, 679.357936, 1256.341536, 1950.660433, 31.699596, 313.154748, 387.063702,
         418.763298, 731.918046, 2.665135043],
        [5000, 39.880657, 1652.537536, 3067.98039, 4760.398583, 73.070203, 694.454719, 898.458633,
         971.528836, 1665.983555, 2.857410308]])

    # Work column-wise: each entry of ``columns`` is one measurement across
    # all data sizes.
    columns = np.transpose(dat_performance)
    data_size = columns[0].astype(int)
    spark_io, spark_proc = columns[8], columns[6]

    positions = np.arange(len(data_size))
    bar_width = 0.5

    _, ax = plt.subplots()
    # The tick offset below assumes bars anchored at their left edge.  That
    # was matplotlib's default before 2.0; newer releases centre bars, so the
    # alignment is requested explicitly to keep ticks under the bar centres.
    io_bars = ax.bar(positions, spark_io, bar_width, align='edge')
    cpu_bars = ax.bar(positions, spark_proc, bar_width, color='#e74c3c', bottom=spark_io,
                      align='edge')

    # Axis labels, tick labels and legend.
    plt.xlabel("Data size")
    ax.set_ylabel('Time(s)')
    ax.set_xticks(positions + bar_width / 2)
    ax.set_xticklabels(data_size)

    ax.legend((io_bars[0], cpu_bars[0]), ('IO', 'CPU'), loc=2)

    # Annotate each stack, just above its top, with the IO share of the total.
    for io_bar, cpu_bar in zip(io_bars, cpu_bars):
        io_height = io_bar.get_height()
        total_height = io_height + cpu_bar.get_height()
        ax.text(io_bar.get_x() + io_bar.get_width() / 2, 1.005 * total_height,
                '%d%%' % int(100 * 1.0 * io_height / total_height),
                ha='center', va='bottom')

    plt.show()


def plot_line_performance():
    """Show serial and Spark timings against data size as four line series.

    The timings are hard-coded below, one row per data size.  Column 0 is
    used as the x axis; columns 4, 9, 8 and 6 are drawn as 'serial total',
    'spark total', 'spark io' and 'spark proc' respectively, all with circle
    markers.  The y axis is labelled in seconds.

    An earlier variant (removed) plotted against evenly spaced positions,
    ``range(len(data_size))``, with the data sizes as tick labels.
    """
    # performance
    dat_performance = np.array([
        [100, 0.583396, 30.847788, 57.884814, 89.315998, 1.471087, 29.364628, 9.114235, 10.585322,
         39.94995, 2.235697366],
        [200, 1.147411, 62.815709, 118.217859, 182.180979, 3.008692, 37.920278, 19.589578, 22.59827,
         60.518548, 3.010332948],
        [500, 2.763806, 162.806317, 299.778606, 465.348729, 6.81705, 88.291989, 73.446282,
         80.263332, 168.555321, 2.760807112],
        [1000, 6.372794, 329.023151, 600.438977, 935.834922, 15.644418, 159.951099, 186.335413,
         201.979831, 361.93093, 2.585672692],
        [2000, 14.960961, 679.357936, 1256.341536, 1950.660433, 31.699596, 313.154748, 387.063702,
         418.763298, 731.918046, 2.665135043],
        [5000, 39.880657, 1652.537536, 3067.98039, 4760.398583, 73.070203, 694.454719, 898.458633,
         971.528836, 1665.983555, 2.857410308]])

    by_column = dat_performance.T
    data_size = by_column[0]

    # (values, extra plot options) in drawing order; the first series keeps
    # matplotlib's default line style.
    series = (
        (by_column[4], {'label': 'serial total'}),
        (by_column[9], {'linestyle': '--', 'label': 'spark total'}),
        (by_column[8], {'linestyle': ':', 'label': 'spark io'}),
        (by_column[6], {'linestyle': '-.', 'label': 'spark proc'}),
    )
    for timings, line_options in series:
        plt.plot(data_size, timings, marker='o', **line_options)

    plt.xlabel("Data size")
    plt.ylabel("Time(s)")
    plt.legend(loc=2)
    plt.show()


def plot_line_io():
    """Show compressed vs. uncompressed timings against data size.

    The timings are hard-coded below, one row per data size.  After the data
    size, each row holds the happybase uncompressed io/cpu/total values, the
    happybase compressed io/cpu/total values and the two 'dist' totals
    (uncompressed, compressed).  Eight series are drawn: totals as solid
    lines, io/cpu as dashed lines; uncompressed series use circle markers and
    compressed series use diamond markers.

    An earlier variant (removed) plotted against evenly spaced positions,
    ``range(len(data_size))``, with the data sizes as tick labels.
    """
    # io
    dat_io = np.array([
        [100, 10.585322, 29.364628, 39.94995, 10.286684, 27.079774, 37.366458, 49.995647,
         55.280739],
        [200, 22.59827, 37.920278, 60.518548, 22.731275, 38.491461, 61.222736, 76.258928,
         83.836657],
        [500, 80.263332, 88.291989, 168.555321, 64.610839, 88.241193, 152.852032, 177.039349,
         143.524813],
        [1000, 201.979831, 159.951099, 361.93093, 172.359455, 158.694248, 331.053703, 467.126756,
         315.578952],
        [2000, 418.763298, 313.154748, 731.918046, 390.990209, 313.085707, 704.075916, 802.138669,
         734.133909],
        [5000, 971.528836, 694.454719, 1665.983555, 898.468232, 717.603061, 1616.071293,
         1860.610954, 1677.044038]])

    # One name per column of the table above.
    (data_size,
     hb_plain_io, hb_plain_cpu, hb_plain_tot,
     hb_packed_io, hb_packed_cpu, hb_packed_tot,
     dist_plain, dist_packed) = dat_io.T

    # (values, plot options) in drawing order; totals keep matplotlib's
    # default line style.
    curves = (
        (dist_plain, {'marker': 'o', 'label': 'dist-uncompressed total'}),
        (dist_packed, {'marker': 'D', 'label': 'dist-compressed total'}),
        (hb_plain_tot, {'marker': 'o', 'label': 'happybase-uncompressed total'}),
        (hb_packed_tot, {'marker': 'D', 'label': 'happybase-compressed total'}),
        (hb_plain_io, {'marker': 'o', 'linestyle': '--',
                       'label': 'happybase-uncompressed io'}),
        (hb_packed_io, {'marker': 'D', 'linestyle': '--',
                        'label': 'happybase-compressed io'}),
        (hb_plain_cpu, {'marker': 'o', 'linestyle': '--',
                        'label': 'happybase-uncompressed cpu'}),
        (hb_packed_cpu, {'marker': 'D', 'linestyle': '--',
                         'label': 'happybase-compressed cpu'}),
    )
    for timings, line_options in curves:
        plt.plot(data_size, timings, **line_options)

    plt.xlabel("Data size")
    plt.ylabel("Time")
    plt.legend(loc=2)
    plt.show()


    # plt.subplot(2, 2, 1)
    # plt.plot(data_size, dist_uncomp, marker='o', label='dist-uncompressed total')
    # plt.plot(data_size, dist_comp, marker='o', label='dist-compressed total')
    # # plt.title('Performance with(out) Compression')
    # plt.ylabel("Time")
    # plt.legend(loc=2)
    #
    # plt.subplot(2, 2, 2)
    # plt.plot(data_size, happybase_uncomp_tot, marker='o', label='happybase-uncompressed total')
    # plt.plot(data_size, happybase_comp_tot, marker='o', label='happybase-compressed total')
    # plt.legend(loc=2)
    #
    # plt.subplot(2, 2, 3)
    # plt.plot(data_size, happybase_uncomp_io, marker='o', linestyle='--',
    #          label='happybase-uncompressed io')
    # plt.plot(data_size, happybase_comp_io, marker='o', linestyle='--',
    #          label='happybase-compressed io')
    # plt.ylabel("Time")
    # plt.xlabel("Data size")
    # plt.legend(loc=2)
    #
    # plt.subplot(2, 2, 4)
    # plt.plot(data_size, happybase_uncomp_cpu, marker='o', linestyle='--',
    #          label='happybase-uncompressed cpu')
    # plt.plot(data_size, happybase_comp_cpu, marker='o', linestyle='--',
    #          label='happybase-compressed cpu')
    # plt.xlabel("Data size")
    # plt.legend(loc=2)
    # plt.show()

    # plt.plot(data_size, dist_uncomp, marker='o', label='dist-uncompressed total')
    # plt.plot(data_size, dist_comp, marker='D', linestyle='--',label='dist-compressed total')
    # plt.xlabel("Data size")
    # plt.ylabel("Time(s)")
    # plt.legend(loc=2)
    # plt.show()
    #
    # plt.plot(data_size, happybase_uncomp_tot, marker='o', label='happybase-uncompressed total')
    # plt.plot(data_size, happybase_comp_tot, marker='D', linestyle='--',label='happybase-compressed total')
    # plt.xlabel("Data size")
    # plt.ylabel("Time(s)")
    # plt.legend(loc=2)
    # plt.show()
    #
    # plt.plot(data_size, happybase_uncomp_io, marker='o',
    #          label='happybase-uncompressed io')
    # plt.plot(data_size, happybase_comp_io, marker='D', linestyle='--',
    #          label='happybase-compressed io')
    # plt.xlabel("Data size")
    # plt.ylabel("Time(s)")
    # plt.legend(loc=2)
    # plt.show()
    #
    # plt.plot(data_size, happybase_uncomp_cpu, marker='o',
    #          label='happybase-uncompressed cpu')
    # plt.plot(data_size, happybase_comp_cpu, marker='D', linestyle='--',
    #          label='happybase-compressed cpu')
    #
    # plt.xlabel("Data size")
    # plt.ylabel("Time(s)")
    # plt.legend(loc=2)
    # plt.show()


if __name__ == '__main__':
    # Script entry point: only the size analysis of the test table is run.
    # Alternatives that were enabled at some point and can be swapped in:
    #   anal_ILSVRC()
    #   anal_ILSVRC_Test()
    #   pre_crop()
    #   plot_line()   # NOTE(review): no function of this name is visible in this file
    #   print timeit.timeit("anal_ILSVRC()", setup="from __main__ import anal_ILSVRC", number=1)
    anal_0000()