Skip to content

Statistics

Statistics - Introduction

https://numpy.org/doc/stable/reference/routines.statistics.html

import numpy as np
import pandas as pd

1. Simple Statistical Exploration

A = np.random.rand(10,10)
A
array([[7.16622070e-01, 8.67937006e-01, 8.56064808e-01, 8.57568350e-01,
        3.67796230e-01, 2.14346363e-01, 6.31599734e-01, 3.57119296e-01,
        4.35844862e-01, 2.35483572e-01],
       [7.22209004e-01, 9.63310994e-01, 8.16973750e-01, 8.06853501e-01,
        6.89970795e-01, 5.83418094e-01, 2.65411361e-02, 7.51624520e-01,
        9.59391711e-02, 4.15581862e-01],
       [8.57467652e-01, 9.63049501e-01, 7.90326873e-01, 7.95018533e-01,
        7.18152201e-01, 4.38823904e-01, 6.45775017e-01, 7.60717860e-01,
        9.24224445e-01, 3.13183949e-01],
       [3.83836565e-01, 8.56498268e-01, 5.93058237e-01, 2.46908006e-01,
        3.10960918e-01, 6.21511447e-01, 2.91997569e-01, 8.13963097e-01,
        2.01951380e-01, 5.79517413e-01],
       [8.41978469e-02, 3.81861596e-01, 1.24320969e-01, 1.92482070e-01,
        4.70194470e-01, 9.69335675e-01, 2.07682914e-01, 8.99462218e-01,
        3.56681685e-01, 2.10451446e-01],
       [2.50727875e-01, 4.45871405e-01, 4.75621851e-01, 1.76392569e-01,
        4.44494388e-01, 1.97214743e-01, 2.23416686e-01, 9.79333246e-01,
        2.13115739e-01, 3.26530393e-01],
       [6.18496895e-01, 8.46193255e-01, 9.20304738e-01, 4.49655661e-01,
        6.66495695e-01, 7.23878345e-01, 4.71483753e-01, 3.35079701e-01,
        2.80457713e-01, 4.17035078e-01],
       [3.74931699e-01, 1.64469071e-02, 8.24518210e-01, 4.40508140e-01,
        3.04546586e-01, 9.61964988e-01, 9.13281778e-01, 1.08067818e-01,
        3.52596222e-01, 9.71936660e-01],
       [6.24254898e-01, 6.83181203e-01, 7.55043126e-01, 5.46638933e-01,
        1.40146074e-01, 8.45252325e-02, 2.54143985e-01, 4.85118512e-04,
        8.08711713e-01, 9.33249065e-01],
       [4.10573251e-01, 6.30441143e-01, 6.04971277e-01, 4.07647377e-01,
        8.25351233e-01, 5.19909474e-01, 9.76184695e-02, 8.11742544e-01,
        6.04160360e-01, 4.48347194e-01]])
A.shape
(10, 10)
A.max()
0.9793332463866128
A.min()
0.00048511851159238617
A.mean()
0.523381592476641
np.median(A)
0.47355280212201306
A.std()
0.2772531462004282
np.cov(A)
array([[ 0.06873698,  0.03278827,  0.03668793, -0.00331421, -0.05083651,
        -0.00850994,  0.02390898, -0.02626347,  0.03131505, -0.01397033],
       [ 0.03278827,  0.09816338,  0.01792033,  0.04059124,  0.01004886,
         0.02993195,  0.03987967, -0.05153719, -0.00568755,  0.03430207],
       [ 0.03668793,  0.01792033,  0.04244191, -0.00450095, -0.0155604 ,
         0.00718968,  0.00519803, -0.05720041,  0.00948249,  0.01091863],
       [-0.00331421,  0.04059124, -0.00450095,  0.05553323,  0.03406878,
         0.03667575,  0.01972246, -0.01604353, -0.00940025,  0.02010751],
       [-0.05083651,  0.01004886, -0.0155604 ,  0.03406878,  0.09681987,
         0.03356765, -0.00601017, -0.01500941, -0.0759788 ,  0.0324855 ],
       [-0.00850994,  0.02993195,  0.00718968,  0.03667575,  0.03356765,
         0.0581328 , -0.00229493, -0.04089154, -0.03034172,  0.03377649],
       [ 0.02390898,  0.03987967,  0.00519803,  0.01972246, -0.00601017,
        -0.00229493,  0.04676247,  0.00605692,  0.00312176,  0.00634826],
       [-0.02626347, -0.05153719, -0.05720041, -0.01604353, -0.01500941,
        -0.04089154,  0.00605692,  0.13017446,  0.01244017, -0.04495907],
       [ 0.03131505, -0.00568755,  0.00948249, -0.00940025, -0.0759788 ,
        -0.03034172,  0.00312176,  0.01244017,  0.11204687, -0.01455545],
       [-0.01397033,  0.03430207,  0.01091863,  0.02010751,  0.0324855 ,
         0.03377649,  0.00634826, -0.04495907, -0.01455545,  0.04560073]])

Mean and std along axis 0 (one value per column)

np.mean(A,axis =0)
array([0.50433178, 0.66547913, 0.67612038, 0.49196731, 0.49381086,
       0.53149283, 0.3763541 , 0.58175954, 0.42736833, 0.48513166])
np.std(A,axis =0)
array([0.22924647, 0.28962689, 0.22578381, 0.24259586, 0.21017855,
       0.2908006 , 0.26627669, 0.33170298, 0.25736132, 0.25495454])

Mean and std along axis 1 (one value per row)

np.mean(A,axis =1)
array([0.55403823, 0.58724228, 0.72067399, 0.49002029, 0.38966709,
       0.37327189, 0.57290808, 0.5268799 , 0.48303793, 0.53607623])
np.std(A,axis = 1)
array([0.2487233 , 0.29723231, 0.19544236, 0.22356186, 0.29519126,
       0.2287346 , 0.20514927, 0.34228207, 0.31755658, 0.20258494])

2. Feature Scaling:

    1. Min-max scaler
A = (100*np.random.rand(10,10)).astype(int)
A
array([[59, 39, 55, 69, 55, 10, 89, 79, 71, 77],
       [36, 72, 69, 36, 38, 43, 21, 53, 65, 18],
       [29, 78, 85, 77, 11, 85, 73, 83, 47, 82],
       [17, 70, 94, 48, 10, 68, 92, 96, 12, 56],
       [ 5, 54, 30, 57, 86, 49, 10, 22, 17, 73],
       [46, 36, 63, 60, 64, 95, 76, 64, 13, 93],
       [79, 98, 56, 67, 38, 21, 16, 67, 15, 13],
       [45, 11, 90, 26, 64, 25, 10, 33, 97, 36],
       [86, 88, 68, 24, 40, 20, 17, 59, 83, 74],
       [67, 76, 64, 47, 73, 82, 77, 67, 15, 49]])
# Min-max scaling by direct calculation: map the values of A onto [0, 1]
# using the minimum and maximum of the *whole* array.  MinMaxScaler (used
# further down) rescales every column on its own, which is why its output
# differs from A_mm.
a_min = A.min()
a_span = A.max() - a_min
A_mm = (A - a_min) / a_span
A_mm
array([[0.58064516, 0.3655914 , 0.53763441, 0.68817204, 0.53763441,
        0.05376344, 0.90322581, 0.79569892, 0.70967742, 0.77419355],
       [0.33333333, 0.72043011, 0.68817204, 0.33333333, 0.35483871,
        0.40860215, 0.17204301, 0.51612903, 0.64516129, 0.13978495],
       [0.25806452, 0.78494624, 0.86021505, 0.77419355, 0.06451613,
        0.86021505, 0.7311828 , 0.83870968, 0.4516129 , 0.82795699],
       [0.12903226, 0.69892473, 0.95698925, 0.46236559, 0.05376344,
        0.67741935, 0.93548387, 0.97849462, 0.07526882, 0.5483871 ],
       [0.        , 0.52688172, 0.2688172 , 0.55913978, 0.87096774,
        0.47311828, 0.05376344, 0.1827957 , 0.12903226, 0.7311828 ],
       [0.44086022, 0.33333333, 0.62365591, 0.59139785, 0.6344086 ,
        0.96774194, 0.76344086, 0.6344086 , 0.08602151, 0.94623656],
       [0.79569892, 1.        , 0.5483871 , 0.66666667, 0.35483871,
        0.17204301, 0.11827957, 0.66666667, 0.10752688, 0.08602151],
       [0.43010753, 0.06451613, 0.91397849, 0.22580645, 0.6344086 ,
        0.21505376, 0.05376344, 0.30107527, 0.98924731, 0.33333333],
       [0.87096774, 0.89247312, 0.67741935, 0.20430108, 0.37634409,
        0.16129032, 0.12903226, 0.58064516, 0.83870968, 0.74193548],
       [0.66666667, 0.76344086, 0.6344086 , 0.4516129 , 0.7311828 ,
        0.82795699, 0.77419355, 0.66666667, 0.10752688, 0.47311828]])
from sklearn import preprocessing

# Fit a MinMaxScaler on A, then apply the fitted scaler to the same data.
minmax_scaler = preprocessing.MinMaxScaler()
minmax_scaler.fit(A)
A_scaled = minmax_scaler.transform(A)
A_scaled
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:595: DataConversionWarning: Data with input dtype int32 was converted to float64 by MinMaxScaler.
  warnings.warn(msg, DataConversionWarning)





array([[0.66666667, 0.32183908, 0.390625  , 0.8490566 , 0.59210526,
        0.        , 0.96341463, 0.77027027, 0.69411765, 0.8       ],
       [0.38271605, 0.70114943, 0.609375  , 0.22641509, 0.36842105,
        0.38823529, 0.13414634, 0.41891892, 0.62352941, 0.0625    ],
       [0.2962963 , 0.77011494, 0.859375  , 1.        , 0.01315789,
        0.88235294, 0.76829268, 0.82432432, 0.41176471, 0.8625    ],
       [0.14814815, 0.67816092, 1.        , 0.45283019, 0.        ,
        0.68235294, 1.        , 1.        , 0.        , 0.5375    ],
       [0.        , 0.49425287, 0.        , 0.62264151, 1.        ,
        0.45882353, 0.        , 0.        , 0.05882353, 0.75      ],
       [0.50617284, 0.28735632, 0.515625  , 0.67924528, 0.71052632,
        1.        , 0.80487805, 0.56756757, 0.01176471, 1.        ],
       [0.91358025, 1.        , 0.40625   , 0.81132075, 0.36842105,
        0.12941176, 0.07317073, 0.60810811, 0.03529412, 0.        ],
       [0.49382716, 0.        , 0.9375    , 0.03773585, 0.71052632,
        0.17647059, 0.        , 0.14864865, 1.        , 0.2875    ],
       [1.        , 0.88505747, 0.59375   , 0.        , 0.39473684,
        0.11764706, 0.08536585, 0.5       , 0.83529412, 0.7625    ],
       [0.7654321 , 0.74712644, 0.53125   , 0.43396226, 0.82894737,
        0.84705882, 0.81707317, 0.60810811, 0.03529412, 0.45      ]])
    2. Standard scaler
A = (100*np.random.rand(10,10)).astype(int)
A
array([[17, 26, 40, 45, 98, 42, 77, 67,  1, 98],
       [ 8, 52, 68,  2,  6, 89, 76, 44,  2,  4],
       [30, 57,  9, 48, 57, 26, 71, 42, 49,  0],
       [13, 18, 65, 81, 36,  2, 25,  7, 80, 18],
       [17, 88, 95, 57, 67, 38, 12, 78, 21, 64],
       [75, 41,  8, 72, 47, 77, 13, 98, 56, 71],
       [96, 14, 50, 87, 24,  9, 48, 25, 11, 25],
       [68, 16, 94, 30, 23, 41, 22, 96, 93, 33],
       [10, 72, 84, 29, 59, 40, 86, 31, 82, 97],
       [72, 33, 16, 78, 12,  6, 63, 25, 28, 24]])
# Standardise A to zero mean and unit standard deviation, using the
# statistics of the whole array: z = (x - mean) / std.
# The parentheses matter: `A - A.mean()/A.std()` performs the division first
# and merely subtracts the scalar ratio mean/std from every element.
# NOTE: the array printed below was produced by that earlier un-parenthesised
# form; re-run the cell to refresh it.  StandardScaler (used further down)
# standardises each column separately, so its values will still differ.
A_ss = (A - A.mean()) / A.std()
A_ss
array([[15.48180308, 24.48180308, 38.48180308, 43.48180308, 96.48180308,
        40.48180308, 75.48180308, 65.48180308, -0.51819692, 96.48180308],
       [ 6.48180308, 50.48180308, 66.48180308,  0.48180308,  4.48180308,
        87.48180308, 74.48180308, 42.48180308,  0.48180308,  2.48180308],
       [28.48180308, 55.48180308,  7.48180308, 46.48180308, 55.48180308,
        24.48180308, 69.48180308, 40.48180308, 47.48180308, -1.51819692],
       [11.48180308, 16.48180308, 63.48180308, 79.48180308, 34.48180308,
         0.48180308, 23.48180308,  5.48180308, 78.48180308, 16.48180308],
       [15.48180308, 86.48180308, 93.48180308, 55.48180308, 65.48180308,
        36.48180308, 10.48180308, 76.48180308, 19.48180308, 62.48180308],
       [73.48180308, 39.48180308,  6.48180308, 70.48180308, 45.48180308,
        75.48180308, 11.48180308, 96.48180308, 54.48180308, 69.48180308],
       [94.48180308, 12.48180308, 48.48180308, 85.48180308, 22.48180308,
         7.48180308, 46.48180308, 23.48180308,  9.48180308, 23.48180308],
       [66.48180308, 14.48180308, 92.48180308, 28.48180308, 21.48180308,
        39.48180308, 20.48180308, 94.48180308, 91.48180308, 31.48180308],
       [ 8.48180308, 70.48180308, 82.48180308, 27.48180308, 57.48180308,
        38.48180308, 84.48180308, 29.48180308, 80.48180308, 95.48180308],
       [70.48180308, 31.48180308, 14.48180308, 76.48180308, 10.48180308,
         4.48180308, 61.48180308, 23.48180308, 26.48180308, 22.48180308]])
from sklearn import preprocessing

# Fit a StandardScaler on A, then apply the fitted scaler to the same data.
standard_scaler = preprocessing.StandardScaler()
standard_scaler.fit(A)
A_scaled = standard_scaler.transform(A)
A_scaled
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:595: DataConversionWarning: Data with input dtype int32 was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:595: DataConversionWarning: Data with input dtype int32 was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)





array([[-0.74717972, -0.65678879, -0.40228321, -0.30436663,  2.05218778,
         0.18323502,  1.00930339,  0.52228691, -1.25857909,  1.57983042],
       [-1.03212114,  0.43088692,  0.47088965, -1.96104578, -1.37433265,
         1.90564425,  0.97286644, -0.24284678, -1.22810502, -1.14002415],
       [-0.33559767,  0.64005532, -1.3690103 , -0.18878437,  0.5251515 ,
        -0.40311705,  0.79068171, -0.30938014,  0.20417627, -1.25576265],
       [-0.87382035, -0.99145824,  0.37733541,  1.08262056, -0.25698903,
        -1.28264517, -0.88541777, -1.47371401,  1.14887244, -0.73493943],
       [-0.74717972,  1.93689944,  1.31287775,  0.15796243,  0.89759938,
         0.036647  , -1.35909806,  0.88822041, -0.64909769,  0.59605324],
       [ 1.08910942, -0.02928358, -1.40019504,  0.73587376,  0.15270363,
         1.46588019, -1.32266112,  1.55355405,  0.41749476,  0.7985956 ],
       [ 1.75397273, -1.15879297, -0.09043576,  1.31378509, -0.70392648,
        -1.02611613, -0.04736803, -0.87491373, -0.95383839, -0.53239707],
       [ 0.86748832, -1.07512561,  1.28169301, -0.88227796, -0.74117127,
         0.14658802, -0.99472861,  1.48702068,  1.54503535, -0.30092008],
       [-0.96880082,  1.26756054,  0.96984556, -0.92080539,  0.59964108,
         0.10994101,  1.33723589, -0.67531364,  1.20982058,  1.5508958 ],
       [ 0.99412895, -0.36395303, -1.15071708,  0.96703829, -1.15086393,
        -1.13605715,  0.49918615, -0.87491373, -0.4357792 , -0.56133169]])

Bayes' Theorem:

An important philosophy in Machine Learning and Statistics.

Important Topics: 1. Bayesian Classifier: Example of Machine Learning in Bayesian theory 2. Boltzmann Machine: Example of Machine Learning for Physicists 3. Quantum Boltzmann Machine: Fusion of Machine Learning and Quantum Physics 4. Markov Random Field: Is there a Field Theory in Machine Learning?

A simple demo illustrating Bayes' theorem

A = (100*np.random.rand(10,4)).astype(int)
A 
array([[38, 15, 66, 92],
       [16, 60, 96, 18],
       [18, 80, 14, 60],
       [88, 41, 31, 33],
       [88, 95, 24, 98],
       [39, 64, 97, 76],
       [17, 20,  1, 87],
       [90, 34,  2, 70],
       [81, 10, 93, 86],
       [60, 56,  3, 63]])
# Wrap the count matrix A in a DataFrame with named rows and columns so that
# individual counts can be looked up by label.
column_labels = ['bud', 'green', 'ripen', 'rotten']
row_labels = ['apple', 'guava', 'mango', 'orange', 'banana',
              'pear', 'papaya', 'strwberrey', 'grape', 'melon']
df = pd.DataFrame(A, index=row_labels, columns=column_labels)
df
bud green ripen rotten
apple 38 15 66 92
guava 16 60 96 18
mango 18 80 14 60
orange 88 41 31 33
banana 88 95 24 98
pear 39 64 97 76
papaya 17 20 1 87
strwberrey 90 34 2 70
grape 81 10 93 86
melon 60 56 3 63
df.sum()
bud       535
green     475
ripen     427
rotten    683
dtype: int64
sum(df.sum())
2120
df.loc['apple','bud']
38
df.loc['apple',:]
bud       38
green     15
ripen     66
rotten    92
Name: apple, dtype: int32
df.loc[:,'ripen']
apple         66
guava         96
mango         14
orange        31
banana        24
pear          97
papaya         1
strwberrey     2
grape         93
melon          3
Name: ripen, dtype: int32
  • P(ripen|apple) = ?
df.loc['apple','ripen']/df.loc['apple',:].sum()
0.3127962085308057
  • P(apple|ripen) = ?
df.loc['apple','ripen']/df.loc[:,'ripen'].sum()
0.15456674473067916

Implementing Bayes' theorem

  • $P(\mathrm{apple}\mid\mathrm{ripen}) = \frac{P(\mathrm{ripen}\mid\mathrm{apple})\,P(\mathrm{apple})}{P(\mathrm{ripen})}$
# Bayes' theorem on the count table:
#   P(apple | ripen) = P(ripen | apple) * P(apple) / P(ripen)
grand_total = df.sum().sum()        # sum of every count in the table
apple_row = df.loc['apple']         # counts for the 'apple' row
ripen_col = df['ripen']             # counts for the 'ripen' column

p_apple = apple_row.sum() / grand_total
p_ripen_given_apple = apple_row['ripen'] / apple_row.sum()
p_ripen = ripen_col.sum() / grand_total

p_apple_given_ripen = (p_ripen_given_apple * p_apple) / p_ripen
p_apple_given_ripen
0.15456674473067916

Mini Assignment:

  1. Generate a random array of size 10 by 10 and convert it to a pandas DataFrame with column names and row names of your choice
  2. Perform feature scaling (min-max scaler and standard scaler) by direct calculation of the minimum, maximum, mean and standard deviation from the DataFrame/array
  3. Implement a scikit-learn preprocessing pipeline to perform feature scaling