Skip to content


Statistics - Introduction

import numpy as np
import pandas as pd

1. Simple Statistical Exploration

# Draw a 10x10 matrix of samples from the uniform distribution on [0, 1).
A = np.random.random_sample((10, 10))
array([[7.16622070e-01, 8.67937006e-01, 8.56064808e-01, 8.57568350e-01,
        3.67796230e-01, 2.14346363e-01, 6.31599734e-01, 3.57119296e-01,
        4.35844862e-01, 2.35483572e-01],
       [7.22209004e-01, 9.63310994e-01, 8.16973750e-01, 8.06853501e-01,
        6.89970795e-01, 5.83418094e-01, 2.65411361e-02, 7.51624520e-01,
        9.59391711e-02, 4.15581862e-01],
       [8.57467652e-01, 9.63049501e-01, 7.90326873e-01, 7.95018533e-01,
        7.18152201e-01, 4.38823904e-01, 6.45775017e-01, 7.60717860e-01,
        9.24224445e-01, 3.13183949e-01],
       [3.83836565e-01, 8.56498268e-01, 5.93058237e-01, 2.46908006e-01,
        3.10960918e-01, 6.21511447e-01, 2.91997569e-01, 8.13963097e-01,
        2.01951380e-01, 5.79517413e-01],
       [8.41978469e-02, 3.81861596e-01, 1.24320969e-01, 1.92482070e-01,
        4.70194470e-01, 9.69335675e-01, 2.07682914e-01, 8.99462218e-01,
        3.56681685e-01, 2.10451446e-01],
       [2.50727875e-01, 4.45871405e-01, 4.75621851e-01, 1.76392569e-01,
        4.44494388e-01, 1.97214743e-01, 2.23416686e-01, 9.79333246e-01,
        2.13115739e-01, 3.26530393e-01],
       [6.18496895e-01, 8.46193255e-01, 9.20304738e-01, 4.49655661e-01,
        6.66495695e-01, 7.23878345e-01, 4.71483753e-01, 3.35079701e-01,
        2.80457713e-01, 4.17035078e-01],
       [3.74931699e-01, 1.64469071e-02, 8.24518210e-01, 4.40508140e-01,
        3.04546586e-01, 9.61964988e-01, 9.13281778e-01, 1.08067818e-01,
        3.52596222e-01, 9.71936660e-01],
       [6.24254898e-01, 6.83181203e-01, 7.55043126e-01, 5.46638933e-01,
        1.40146074e-01, 8.45252325e-02, 2.54143985e-01, 4.85118512e-04,
        8.08711713e-01, 9.33249065e-01],
       [4.10573251e-01, 6.30441143e-01, 6.04971277e-01, 4.07647377e-01,
        8.25351233e-01, 5.19909474e-01, 9.76184695e-02, 8.11742544e-01,
        6.04160360e-01, 4.48347194e-01]])
(10, 10)
array([[ 0.06873698,  0.03278827,  0.03668793, -0.00331421, -0.05083651,
        -0.00850994,  0.02390898, -0.02626347,  0.03131505, -0.01397033],
       [ 0.03278827,  0.09816338,  0.01792033,  0.04059124,  0.01004886,
         0.02993195,  0.03987967, -0.05153719, -0.00568755,  0.03430207],
       [ 0.03668793,  0.01792033,  0.04244191, -0.00450095, -0.0155604 ,
         0.00718968,  0.00519803, -0.05720041,  0.00948249,  0.01091863],
       [-0.00331421,  0.04059124, -0.00450095,  0.05553323,  0.03406878,
         0.03667575,  0.01972246, -0.01604353, -0.00940025,  0.02010751],
       [-0.05083651,  0.01004886, -0.0155604 ,  0.03406878,  0.09681987,
         0.03356765, -0.00601017, -0.01500941, -0.0759788 ,  0.0324855 ],
       [-0.00850994,  0.02993195,  0.00718968,  0.03667575,  0.03356765,
         0.0581328 , -0.00229493, -0.04089154, -0.03034172,  0.03377649],
       [ 0.02390898,  0.03987967,  0.00519803,  0.01972246, -0.00601017,
        -0.00229493,  0.04676247,  0.00605692,  0.00312176,  0.00634826],
       [-0.02626347, -0.05153719, -0.05720041, -0.01604353, -0.01500941,
        -0.04089154,  0.00605692,  0.13017446,  0.01244017, -0.04495907],
       [ 0.03131505, -0.00568755,  0.00948249, -0.00940025, -0.0759788 ,
        -0.03034172,  0.00312176,  0.01244017,  0.11204687, -0.01455545],
       [-0.01397033,  0.03430207,  0.01091863,  0.02010751,  0.0324855 ,
         0.03377649,  0.00634826, -0.04495907, -0.01455545,  0.04560073]])

Mean and std of each column (axis=0 reduces over the rows)

# axis=0 averages down the rows, giving one mean per column of A.
A.mean(axis=0)
array([0.50433178, 0.66547913, 0.67612038, 0.49196731, 0.49381086,
       0.53149283, 0.3763541 , 0.58175954, 0.42736833, 0.48513166])
# axis=0 reduces down the rows, giving one (population, ddof=0) std per column.
A.std(axis=0)
array([0.22924647, 0.28962689, 0.22578381, 0.24259586, 0.21017855,
       0.2908006 , 0.26627669, 0.33170298, 0.25736132, 0.25495454])

Mean and std of each row (axis=1 reduces over the columns)

# axis=1 averages across the columns, giving one mean per row of A.
A.mean(axis=1)
array([0.55403823, 0.58724228, 0.72067399, 0.49002029, 0.38966709,
       0.37327189, 0.57290808, 0.5268799 , 0.48303793, 0.53607623])
# axis=1 reduces across the columns, giving one (population, ddof=0) std per row.
A.std(axis=1)
array([0.2487233 , 0.29723231, 0.19544236, 0.22356186, 0.29519126,
       0.2287346 , 0.20514927, 0.34228207, 0.31755658, 0.20258494])

2. Feature Scaling:

    1. Min-max scaler
# Random 10x10 integer matrix: uniform samples in [0, 1) scaled by 100 and
# truncated, so every entry lies in 0..99.
A = (np.random.rand(10, 10) * 100).astype(int)
array([[59, 39, 55, 69, 55, 10, 89, 79, 71, 77],
       [36, 72, 69, 36, 38, 43, 21, 53, 65, 18],
       [29, 78, 85, 77, 11, 85, 73, 83, 47, 82],
       [17, 70, 94, 48, 10, 68, 92, 96, 12, 56],
       [ 5, 54, 30, 57, 86, 49, 10, 22, 17, 73],
       [46, 36, 63, 60, 64, 95, 76, 64, 13, 93],
       [79, 98, 56, 67, 38, 21, 16, 67, 15, 13],
       [45, 11, 90, 26, 64, 25, 10, 33, 97, 36],
       [86, 88, 68, 24, 40, 20, 17, 59, 83, 74],
       [67, 76, 64, 47, 73, 82, 77, 67, 15, 49]])
# Min-max scale each feature (column) independently: axis=0 takes the minimum
# and maximum down the rows, so every column is mapped onto [0, 1]. Using the
# global A.min()/A.max() instead would scale the whole matrix with one range
# and would not match sklearn's MinMaxScaler.
# NOTE: a constant column has max == min and would divide by zero here.
A_mm = (A - A.min(axis=0))/(A.max(axis=0) - A.min(axis=0))
array([[0.66666667, 0.32183908, 0.390625  , 0.8490566 , 0.59210526,
        0.        , 0.96341463, 0.77027027, 0.69411765, 0.8       ],
       [0.38271605, 0.70114943, 0.609375  , 0.22641509, 0.36842105,
        0.38823529, 0.13414634, 0.41891892, 0.62352941, 0.0625    ],
       [0.2962963 , 0.77011494, 0.859375  , 1.        , 0.01315789,
        0.88235294, 0.76829268, 0.82432432, 0.41176471, 0.8625    ],
       [0.14814815, 0.67816092, 1.        , 0.45283019, 0.        ,
        0.68235294, 1.        , 1.        , 0.        , 0.5375    ],
       [0.        , 0.49425287, 0.        , 0.62264151, 1.        ,
        0.45882353, 0.        , 0.        , 0.05882353, 0.75      ],
       [0.50617284, 0.28735632, 0.515625  , 0.67924528, 0.71052632,
        1.        , 0.80487805, 0.56756757, 0.01176471, 1.        ],
       [0.91358025, 1.        , 0.40625   , 0.81132075, 0.36842105,
        0.12941176, 0.07317073, 0.60810811, 0.03529412, 0.        ],
       [0.49382716, 0.        , 0.9375    , 0.03773585, 0.71052632,
        0.17647059, 0.        , 0.14864865, 1.        , 0.2875    ],
       [1.        , 0.88505747, 0.59375   , 0.        , 0.39473684,
        0.11764706, 0.08536585, 0.5       , 0.83529412, 0.7625    ],
       [0.7654321 , 0.74712644, 0.53125   , 0.43396226, 0.82894737,
        0.84705882, 0.81707317, 0.60810811, 0.03529412, 0.45      ]])
from sklearn import preprocessing

# Learn the per-column min/max from A and rescale A with them in one call.
A_scaled = preprocessing.MinMaxScaler().fit_transform(A)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\ DataConversionWarning: Data with input dtype int32 was converted to float64 by MinMaxScaler.
  warnings.warn(msg, DataConversionWarning)

array([[0.66666667, 0.32183908, 0.390625  , 0.8490566 , 0.59210526,
        0.        , 0.96341463, 0.77027027, 0.69411765, 0.8       ],
       [0.38271605, 0.70114943, 0.609375  , 0.22641509, 0.36842105,
        0.38823529, 0.13414634, 0.41891892, 0.62352941, 0.0625    ],
       [0.2962963 , 0.77011494, 0.859375  , 1.        , 0.01315789,
        0.88235294, 0.76829268, 0.82432432, 0.41176471, 0.8625    ],
       [0.14814815, 0.67816092, 1.        , 0.45283019, 0.        ,
        0.68235294, 1.        , 1.        , 0.        , 0.5375    ],
       [0.        , 0.49425287, 0.        , 0.62264151, 1.        ,
        0.45882353, 0.        , 0.        , 0.05882353, 0.75      ],
       [0.50617284, 0.28735632, 0.515625  , 0.67924528, 0.71052632,
        1.        , 0.80487805, 0.56756757, 0.01176471, 1.        ],
       [0.91358025, 1.        , 0.40625   , 0.81132075, 0.36842105,
        0.12941176, 0.07317073, 0.60810811, 0.03529412, 0.        ],
       [0.49382716, 0.        , 0.9375    , 0.03773585, 0.71052632,
        0.17647059, 0.        , 0.14864865, 1.        , 0.2875    ],
       [1.        , 0.88505747, 0.59375   , 0.        , 0.39473684,
        0.11764706, 0.08536585, 0.5       , 0.83529412, 0.7625    ],
       [0.7654321 , 0.74712644, 0.53125   , 0.43396226, 0.82894737,
        0.84705882, 0.81707317, 0.60810811, 0.03529412, 0.45      ]])
    2. Standard scaler
# Fresh random 10x10 integer matrix with entries in 0..99 for the
# standardization example.
A = (np.random.rand(10, 10) * 100).astype(int)
array([[17, 26, 40, 45, 98, 42, 77, 67,  1, 98],
       [ 8, 52, 68,  2,  6, 89, 76, 44,  2,  4],
       [30, 57,  9, 48, 57, 26, 71, 42, 49,  0],
       [13, 18, 65, 81, 36,  2, 25,  7, 80, 18],
       [17, 88, 95, 57, 67, 38, 12, 78, 21, 64],
       [75, 41,  8, 72, 47, 77, 13, 98, 56, 71],
       [96, 14, 50, 87, 24,  9, 48, 25, 11, 25],
       [68, 16, 94, 30, 23, 41, 22, 96, 93, 33],
       [10, 72, 84, 29, 59, 40, 86, 31, 82, 97],
       [72, 33, 16, 78, 12,  6, 63, 25, 28, 24]])
# Standardize each feature (column): subtract the column mean, then divide by
# the column standard deviation (np.std defaults to the population std, ddof=0).
# The parentheses matter: `A - A.mean()/A.std()` only subtracts the scalar
# mean/std from A and leaves the data unscaled.
# NOTE: a constant column has std == 0 and would divide by zero here.
A_ss = (A - A.mean(axis=0))/A.std(axis=0)
array([[-0.74717972, -0.65678879, -0.40228321, -0.30436663,  2.05218778,
         0.18323502,  1.00930339,  0.52228691, -1.25857909,  1.57983042],
       [-1.03212114,  0.43088692,  0.47088965, -1.96104578, -1.37433265,
         1.90564425,  0.97286644, -0.24284678, -1.22810502, -1.14002415],
       [-0.33559767,  0.64005532, -1.3690103 , -0.18878437,  0.5251515 ,
        -0.40311705,  0.79068171, -0.30938014,  0.20417627, -1.25576265],
       [-0.87382035, -0.99145824,  0.37733541,  1.08262056, -0.25698903,
        -1.28264517, -0.88541777, -1.47371401,  1.14887244, -0.73493943],
       [-0.74717972,  1.93689944,  1.31287775,  0.15796243,  0.89759938,
         0.036647  , -1.35909806,  0.88822041, -0.64909769,  0.59605324],
       [ 1.08910942, -0.02928358, -1.40019504,  0.73587376,  0.15270363,
         1.46588019, -1.32266112,  1.55355405,  0.41749476,  0.7985956 ],
       [ 1.75397273, -1.15879297, -0.09043576,  1.31378509, -0.70392648,
        -1.02611613, -0.04736803, -0.87491373, -0.95383839, -0.53239707],
       [ 0.86748832, -1.07512561,  1.28169301, -0.88227796, -0.74117127,
         0.14658802, -0.99472861,  1.48702068,  1.54503535, -0.30092008],
       [-0.96880082,  1.26756054,  0.96984556, -0.92080539,  0.59964108,
         0.10994101,  1.33723589, -0.67531364,  1.20982058,  1.5508958 ],
       [ 0.99412895, -0.36395303, -1.15071708,  0.96703829, -1.15086393,
        -1.13605715,  0.49918615, -0.87491373, -0.4357792 , -0.56133169]])
from sklearn import preprocessing

# Learn the per-column mean/std from A and standardize A with them in one call.
A_scaled = preprocessing.StandardScaler().fit_transform(A)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\ DataConversionWarning: Data with input dtype int32 was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\ DataConversionWarning: Data with input dtype int32 was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)

array([[-0.74717972, -0.65678879, -0.40228321, -0.30436663,  2.05218778,
         0.18323502,  1.00930339,  0.52228691, -1.25857909,  1.57983042],
       [-1.03212114,  0.43088692,  0.47088965, -1.96104578, -1.37433265,
         1.90564425,  0.97286644, -0.24284678, -1.22810502, -1.14002415],
       [-0.33559767,  0.64005532, -1.3690103 , -0.18878437,  0.5251515 ,
        -0.40311705,  0.79068171, -0.30938014,  0.20417627, -1.25576265],
       [-0.87382035, -0.99145824,  0.37733541,  1.08262056, -0.25698903,
        -1.28264517, -0.88541777, -1.47371401,  1.14887244, -0.73493943],
       [-0.74717972,  1.93689944,  1.31287775,  0.15796243,  0.89759938,
         0.036647  , -1.35909806,  0.88822041, -0.64909769,  0.59605324],
       [ 1.08910942, -0.02928358, -1.40019504,  0.73587376,  0.15270363,
         1.46588019, -1.32266112,  1.55355405,  0.41749476,  0.7985956 ],
       [ 1.75397273, -1.15879297, -0.09043576,  1.31378509, -0.70392648,
        -1.02611613, -0.04736803, -0.87491373, -0.95383839, -0.53239707],
       [ 0.86748832, -1.07512561,  1.28169301, -0.88227796, -0.74117127,
         0.14658802, -0.99472861,  1.48702068,  1.54503535, -0.30092008],
       [-0.96880082,  1.26756054,  0.96984556, -0.92080539,  0.59964108,
         0.10994101,  1.33723589, -0.67531364,  1.20982058,  1.5508958 ],
       [ 0.99412895, -0.36395303, -1.15071708,  0.96703829, -1.15086393,
        -1.13605715,  0.49918615, -0.87491373, -0.4357792 , -0.56133169]])

Bayes' Theorem:

An important philosophy in Machine Learning and Statistics.

Important Topics: 1. Bayesian Classifier: Example of Machine Learning in Bayesian theory 2. Boltzmann Machine: Example of Machine Learning for Physicists 3. Quantum Boltzmann Machine: Fusion of Machine Learning and Quantum Physics 4. Markov Random Field: Is there a Field Theory in Machine Learning?

A simple demo illustrating Bayes' theorem

# Random 10x4 integer matrix of counts in 0..99 (10 rows, 4 columns).
A = (np.random.rand(10, 4) * 100).astype(int)
array([[38, 15, 66, 92],
       [16, 60, 96, 18],
       [18, 80, 14, 60],
       [88, 41, 31, 33],
       [88, 95, 24, 98],
       [39, 64, 97, 76],
       [17, 20,  1, 87],
       [90, 34,  2, 70],
       [81, 10, 93, 86],
       [60, 56,  3, 63]])
# Wrap the count matrix in a labelled table: rows are indexed by fruit name,
# columns by stage name.
df = pd.DataFrame(
    data=A,
    index=['apple','guava','mango','orange','banana','pear','papaya','strwberrey','grape','melon'],
    columns=['bud','green','ripen','rotten'],
)
bud green ripen rotten
apple 38 15 66 92
guava 16 60 96 18
mango 18 80 14 60
orange 88 41 31 33
banana 88 95 24 98
pear 39 64 97 76
papaya 17 20 1 87
strwberrey 90 34 2 70
grape 81 10 93 86
melon 60 56 3 63
bud       535
green     475
ripen     427
rotten    683
dtype: int64
bud       38
green     15
ripen     66
rotten    92
Name: apple, dtype: int32
apple         66
guava         96
mango         14
orange        31
banana        24
pear          97
papaya         1
strwberrey     2
grape         93
melon          3
Name: ripen, dtype: int32
  • P(ripen|apple) = ?
  • P(apple|ripen) = ?

Implementing Bayes' theorem

  • $P(apple|ripen) = \frac{P(ripen|apple)\,P(apple)}{P(ripen)}$
# Prior P(apple): share of all counts in the table that fall in the apple row.
p_apple = df.loc['apple'].sum() / df.sum().sum()
# Likelihood P(ripen|apple): share of the apple row that is in the ripen column.
p_ripen_given_apple = df.at['apple', 'ripen'] / df.loc['apple'].sum()
# Evidence P(ripen): share of all counts that fall in the ripen column.
p_ripen = df['ripen'].sum() / df.sum().sum()

# Bayes' theorem: posterior = likelihood * prior / evidence.
p_apple_given_ripen = p_ripen_given_apple * p_apple / p_ripen

Mini Assignment:

  1. Generate a random array of size 10 by 10 and convert it to a pandas dataframe with column names and row names of your interest
  2. Perform feature scaling (min-max scaler and standard scaler) by direct calculation of the minimum, maximum, mean, and standard deviation from the dataframe/array
  3. Implement Scikit-learn preprocessing pipeline to perform feature scaling