Statistics

Satistics - Introduction

https://numpy.org/doc/stable/reference/routines.statistics.html

import numpy as np
import pandas as pd

1. Simple Statistical Exploration

A = np.random.rand(10,10)
A

array([[7.16622070e-01, 8.67937006e-01, 8.56064808e-01, 8.57568350e-01,
        3.67796230e-01, 2.14346363e-01, 6.31599734e-01, 3.57119296e-01,
        4.35844862e-01, 2.35483572e-01],
       [7.22209004e-01, 9.63310994e-01, 8.16973750e-01, 8.06853501e-01,
        6.89970795e-01, 5.83418094e-01, 2.65411361e-02, 7.51624520e-01,
        9.59391711e-02, 4.15581862e-01],
       [8.57467652e-01, 9.63049501e-01, 7.90326873e-01, 7.95018533e-01,
        7.18152201e-01, 4.38823904e-01, 6.45775017e-01, 7.60717860e-01,
        9.24224445e-01, 3.13183949e-01],
       [3.83836565e-01, 8.56498268e-01, 5.93058237e-01, 2.46908006e-01,
        3.10960918e-01, 6.21511447e-01, 2.91997569e-01, 8.13963097e-01,
        2.01951380e-01, 5.79517413e-01],
       [8.41978469e-02, 3.81861596e-01, 1.24320969e-01, 1.92482070e-01,
        4.70194470e-01, 9.69335675e-01, 2.07682914e-01, 8.99462218e-01,
        3.56681685e-01, 2.10451446e-01],
       [2.50727875e-01, 4.45871405e-01, 4.75621851e-01, 1.76392569e-01,
        4.44494388e-01, 1.97214743e-01, 2.23416686e-01, 9.79333246e-01,
        2.13115739e-01, 3.26530393e-01],
       [6.18496895e-01, 8.46193255e-01, 9.20304738e-01, 4.49655661e-01,
        6.66495695e-01, 7.23878345e-01, 4.71483753e-01, 3.35079701e-01,
        2.80457713e-01, 4.17035078e-01],
       [3.74931699e-01, 1.64469071e-02, 8.24518210e-01, 4.40508140e-01,
        3.04546586e-01, 9.61964988e-01, 9.13281778e-01, 1.08067818e-01,
        3.52596222e-01, 9.71936660e-01],
       [6.24254898e-01, 6.83181203e-01, 7.55043126e-01, 5.46638933e-01,
        1.40146074e-01, 8.45252325e-02, 2.54143985e-01, 4.85118512e-04,
        8.08711713e-01, 9.33249065e-01],
       [4.10573251e-01, 6.30441143e-01, 6.04971277e-01, 4.07647377e-01,
        8.25351233e-01, 5.19909474e-01, 9.76184695e-02, 8.11742544e-01,
        6.04160360e-01, 4.48347194e-01]])

A.shape

(10, 10)

A.max()

0.9793332463866128

A.min()

0.00048511851159238617

A.mean()

0.523381592476641

np.median(A)

0.47355280212201306

A.std()

0.2772531462004282

np.cov(A)

array([[ 0.06873698,  0.03278827,  0.03668793, -0.00331421, -0.05083651,
        -0.00850994,  0.02390898, -0.02626347,  0.03131505, -0.01397033],
       [ 0.03278827,  0.09816338,  0.01792033,  0.04059124,  0.01004886,
         0.02993195,  0.03987967, -0.05153719, -0.00568755,  0.03430207],
       [ 0.03668793,  0.01792033,  0.04244191, -0.00450095, -0.0155604 ,
         0.00718968,  0.00519803, -0.05720041,  0.00948249,  0.01091863],
       [-0.00331421,  0.04059124, -0.00450095,  0.05553323,  0.03406878,
         0.03667575,  0.01972246, -0.01604353, -0.00940025,  0.02010751],
       [-0.05083651,  0.01004886, -0.0155604 ,  0.03406878,  0.09681987,
         0.03356765, -0.00601017, -0.01500941, -0.0759788 ,  0.0324855 ],
       [-0.00850994,  0.02993195,  0.00718968,  0.03667575,  0.03356765,
         0.0581328 , -0.00229493, -0.04089154, -0.03034172,  0.03377649],
       [ 0.02390898,  0.03987967,  0.00519803,  0.01972246, -0.00601017,
        -0.00229493,  0.04676247,  0.00605692,  0.00312176,  0.00634826],
       [-0.02626347, -0.05153719, -0.05720041, -0.01604353, -0.01500941,
        -0.04089154,  0.00605692,  0.13017446,  0.01244017, -0.04495907],
       [ 0.03131505, -0.00568755,  0.00948249, -0.00940025, -0.0759788 ,
        -0.03034172,  0.00312176,  0.01244017,  0.11204687, -0.01455545],
       [-0.01397033,  0.03430207,  0.01091863,  0.02010751,  0.0324855 ,
         0.03377649,  0.00634826, -0.04495907, -0.01455545,  0.04560073]])

Mean and std of row

np.mean(A,axis =0)

array([0.50433178, 0.66547913, 0.67612038, 0.49196731, 0.49381086,
       0.53149283, 0.3763541 , 0.58175954, 0.42736833, 0.48513166])

np.std(A,axis =0)

array([0.22924647, 0.28962689, 0.22578381, 0.24259586, 0.21017855,
       0.2908006 , 0.26627669, 0.33170298, 0.25736132, 0.25495454])

Mean and std of col

np.mean(A,axis =1)

array([0.55403823, 0.58724228, 0.72067399, 0.49002029, 0.38966709,
       0.37327189, 0.57290808, 0.5268799 , 0.48303793, 0.53607623])

np.std(A,axis = 1)

array([0.2487233 , 0.29723231, 0.19544236, 0.22356186, 0.29519126,
       0.2287346 , 0.20514927, 0.34228207, 0.31755658, 0.20258494])

2. Feature Scaling:

1. Mini-max scalar

A = (100*np.random.rand(10,10)).astype(int)
A

array([[59, 39, 55, 69, 55, 10, 89, 79, 71, 77],
       [36, 72, 69, 36, 38, 43, 21, 53, 65, 18],
       [29, 78, 85, 77, 11, 85, 73, 83, 47, 82],
       [17, 70, 94, 48, 10, 68, 92, 96, 12, 56],
       [ 5, 54, 30, 57, 86, 49, 10, 22, 17, 73],
       [46, 36, 63, 60, 64, 95, 76, 64, 13, 93],
       [79, 98, 56, 67, 38, 21, 16, 67, 15, 13],
       [45, 11, 90, 26, 64, 25, 10, 33, 97, 36],
       [86, 88, 68, 24, 40, 20, 17, 59, 83, 74],
       [67, 76, 64, 47, 73, 82, 77, 67, 15, 49]])

A_mm = (A - A.min())/(A.max()-A.min())

A_mm

array([[0.58064516, 0.3655914 , 0.53763441, 0.68817204, 0.53763441,
        0.05376344, 0.90322581, 0.79569892, 0.70967742, 0.77419355],
       [0.33333333, 0.72043011, 0.68817204, 0.33333333, 0.35483871,
        0.40860215, 0.17204301, 0.51612903, 0.64516129, 0.13978495],
       [0.25806452, 0.78494624, 0.86021505, 0.77419355, 0.06451613,
        0.86021505, 0.7311828 , 0.83870968, 0.4516129 , 0.82795699],
       [0.12903226, 0.69892473, 0.95698925, 0.46236559, 0.05376344,
        0.67741935, 0.93548387, 0.97849462, 0.07526882, 0.5483871 ],
       [0.        , 0.52688172, 0.2688172 , 0.55913978, 0.87096774,
        0.47311828, 0.05376344, 0.1827957 , 0.12903226, 0.7311828 ],
       [0.44086022, 0.33333333, 0.62365591, 0.59139785, 0.6344086 ,
        0.96774194, 0.76344086, 0.6344086 , 0.08602151, 0.94623656],
       [0.79569892, 1.        , 0.5483871 , 0.66666667, 0.35483871,
        0.17204301, 0.11827957, 0.66666667, 0.10752688, 0.08602151],
       [0.43010753, 0.06451613, 0.91397849, 0.22580645, 0.6344086 ,
        0.21505376, 0.05376344, 0.30107527, 0.98924731, 0.33333333],
       [0.87096774, 0.89247312, 0.67741935, 0.20430108, 0.37634409,
        0.16129032, 0.12903226, 0.58064516, 0.83870968, 0.74193548],
       [0.66666667, 0.76344086, 0.6344086 , 0.4516129 , 0.7311828 ,
        0.82795699, 0.77419355, 0.66666667, 0.10752688, 0.47311828]])

from sklearn import preprocessing

A_scaled = preprocessing.MinMaxScaler().fit(A).transform(A)
A_scaled

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:595: DataConversionWarning: Data with input dtype int32 was converted to float64 by MinMaxScaler.
  warnings.warn(msg, DataConversionWarning)





array([[0.66666667, 0.32183908, 0.390625  , 0.8490566 , 0.59210526,
        0.        , 0.96341463, 0.77027027, 0.69411765, 0.8       ],
       [0.38271605, 0.70114943, 0.609375  , 0.22641509, 0.36842105,
        0.38823529, 0.13414634, 0.41891892, 0.62352941, 0.0625    ],
       [0.2962963 , 0.77011494, 0.859375  , 1.        , 0.01315789,
        0.88235294, 0.76829268, 0.82432432, 0.41176471, 0.8625    ],
       [0.14814815, 0.67816092, 1.        , 0.45283019, 0.        ,
        0.68235294, 1.        , 1.        , 0.        , 0.5375    ],
       [0.        , 0.49425287, 0.        , 0.62264151, 1.        ,
        0.45882353, 0.        , 0.        , 0.05882353, 0.75      ],
       [0.50617284, 0.28735632, 0.515625  , 0.67924528, 0.71052632,
        1.        , 0.80487805, 0.56756757, 0.01176471, 1.        ],
       [0.91358025, 1.        , 0.40625   , 0.81132075, 0.36842105,
        0.12941176, 0.07317073, 0.60810811, 0.03529412, 0.        ],
       [0.49382716, 0.        , 0.9375    , 0.03773585, 0.71052632,
        0.17647059, 0.        , 0.14864865, 1.        , 0.2875    ],
       [1.        , 0.88505747, 0.59375   , 0.        , 0.39473684,
        0.11764706, 0.08536585, 0.5       , 0.83529412, 0.7625    ],
       [0.7654321 , 0.74712644, 0.53125   , 0.43396226, 0.82894737,
        0.84705882, 0.81707317, 0.60810811, 0.03529412, 0.45      ]])

1. Standard Scalar

A = (100*np.random.rand(10,10)).astype(int)
A

array([[17, 26, 40, 45, 98, 42, 77, 67,  1, 98],
       [ 8, 52, 68,  2,  6, 89, 76, 44,  2,  4],
       [30, 57,  9, 48, 57, 26, 71, 42, 49,  0],
       [13, 18, 65, 81, 36,  2, 25,  7, 80, 18],
       [17, 88, 95, 57, 67, 38, 12, 78, 21, 64],
       [75, 41,  8, 72, 47, 77, 13, 98, 56, 71],
       [96, 14, 50, 87, 24,  9, 48, 25, 11, 25],
       [68, 16, 94, 30, 23, 41, 22, 96, 93, 33],
       [10, 72, 84, 29, 59, 40, 86, 31, 82, 97],
       [72, 33, 16, 78, 12,  6, 63, 25, 28, 24]])

A_ss = A - A.mean()/A.std()

A_ss

array([[15.48180308, 24.48180308, 38.48180308, 43.48180308, 96.48180308,
        40.48180308, 75.48180308, 65.48180308, -0.51819692, 96.48180308],
       [ 6.48180308, 50.48180308, 66.48180308,  0.48180308,  4.48180308,
        87.48180308, 74.48180308, 42.48180308,  0.48180308,  2.48180308],
       [28.48180308, 55.48180308,  7.48180308, 46.48180308, 55.48180308,
        24.48180308, 69.48180308, 40.48180308, 47.48180308, -1.51819692],
       [11.48180308, 16.48180308, 63.48180308, 79.48180308, 34.48180308,
         0.48180308, 23.48180308,  5.48180308, 78.48180308, 16.48180308],
       [15.48180308, 86.48180308, 93.48180308, 55.48180308, 65.48180308,
        36.48180308, 10.48180308, 76.48180308, 19.48180308, 62.48180308],
       [73.48180308, 39.48180308,  6.48180308, 70.48180308, 45.48180308,
        75.48180308, 11.48180308, 96.48180308, 54.48180308, 69.48180308],
       [94.48180308, 12.48180308, 48.48180308, 85.48180308, 22.48180308,
         7.48180308, 46.48180308, 23.48180308,  9.48180308, 23.48180308],
       [66.48180308, 14.48180308, 92.48180308, 28.48180308, 21.48180308,
        39.48180308, 20.48180308, 94.48180308, 91.48180308, 31.48180308],
       [ 8.48180308, 70.48180308, 82.48180308, 27.48180308, 57.48180308,
        38.48180308, 84.48180308, 29.48180308, 80.48180308, 95.48180308],
       [70.48180308, 31.48180308, 14.48180308, 76.48180308, 10.48180308,
         4.48180308, 61.48180308, 23.48180308, 26.48180308, 22.48180308]])

from sklearn import preprocessing

A_scaled = preprocessing.StandardScaler().fit(A).transform(A)
A_scaled

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:595: DataConversionWarning: Data with input dtype int32 was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:595: DataConversionWarning: Data with input dtype int32 was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)





array([[-0.74717972, -0.65678879, -0.40228321, -0.30436663,  2.05218778,
         0.18323502,  1.00930339,  0.52228691, -1.25857909,  1.57983042],
       [-1.03212114,  0.43088692,  0.47088965, -1.96104578, -1.37433265,
         1.90564425,  0.97286644, -0.24284678, -1.22810502, -1.14002415],
       [-0.33559767,  0.64005532, -1.3690103 , -0.18878437,  0.5251515 ,
        -0.40311705,  0.79068171, -0.30938014,  0.20417627, -1.25576265],
       [-0.87382035, -0.99145824,  0.37733541,  1.08262056, -0.25698903,
        -1.28264517, -0.88541777, -1.47371401,  1.14887244, -0.73493943],
       [-0.74717972,  1.93689944,  1.31287775,  0.15796243,  0.89759938,
         0.036647  , -1.35909806,  0.88822041, -0.64909769,  0.59605324],
       [ 1.08910942, -0.02928358, -1.40019504,  0.73587376,  0.15270363,
         1.46588019, -1.32266112,  1.55355405,  0.41749476,  0.7985956 ],
       [ 1.75397273, -1.15879297, -0.09043576,  1.31378509, -0.70392648,
        -1.02611613, -0.04736803, -0.87491373, -0.95383839, -0.53239707],
       [ 0.86748832, -1.07512561,  1.28169301, -0.88227796, -0.74117127,
         0.14658802, -0.99472861,  1.48702068,  1.54503535, -0.30092008],
       [-0.96880082,  1.26756054,  0.96984556, -0.92080539,  0.59964108,
         0.10994101,  1.33723589, -0.67531364,  1.20982058,  1.5508958 ],
       [ 0.99412895, -0.36395303, -1.15071708,  0.96703829, -1.15086393,
        -1.13605715,  0.49918615, -0.87491373, -0.4357792 , -0.56133169]])

Baye's Theorem:

An important philosophy in Machine Learning and Statistics.

Important Topics: 1. Bayesian Classifier : Example of Machine Learning in Bayesian theory 2. Boltzman Machine: Example of Machine Learning for Physicist. 4. Quantum Boltzman Machine: Fusion of Machine Learning and Quantum Physics 5. Markove Random Field: Is there a Field Theory in Machine Learning?

A simple demo illustration Baye's theorm

A = (100*np.random.rand(10,4)).astype(int)
A

array([[38, 15, 66, 92],
       [16, 60, 96, 18],
       [18, 80, 14, 60],
       [88, 41, 31, 33],
       [88, 95, 24, 98],
       [39, 64, 97, 76],
       [17, 20,  1, 87],
       [90, 34,  2, 70],
       [81, 10, 93, 86],
       [60, 56,  3, 63]])

df = pd.DataFrame(A, columns= ['bud','green','ripen','rotten'],\
                  index= ['apple','guava','mango','orange','banana','pear','papaya','strwberrey','grape','melon'])

df

	bud	green	ripen	rotten
apple	38	15	66	92
guava	16	60	96	18
mango	18	80	14	60
orange	88	41	31	33
banana	88	95	24	98
pear	39	64	97	76
papaya	17	20	1	87
strwberrey	90	34	2	70
grape	81	10	93	86
melon	60	56	3	63

df.sum()

bud       535
green     475
ripen     427
rotten    683
dtype: int64

sum(df.sum())

df.loc['apple','bud']

df.loc['apple',:]

bud       38
green     15
ripen     66
rotten    92
Name: apple, dtype: int32

df.loc[:,'ripen']

apple         66
guava         96
mango         14
orange        31
banana        24
pear          97
papaya         1
strwberrey     2
grape         93
melon          3
Name: ripen, dtype: int32

P(ripen|apple) = ?

df.loc['apple','ripen']/df.loc['apple',:].sum()

0.3127962085308057

P(apple|ripen) = ?

df.loc['apple','ripen']/df.loc[:,'ripen'].sum()

0.15456674473067916

Implementing Baye's theorem

$P(apple|ripen) = \frac{p(ripen|apple)*p(apple)}{p(ripen)}$

p_apple = df.loc['apple',:].sum()/sum(df.sum())
p_ripen_given_apple = df.loc['apple','ripen']/df.loc['apple',:].sum()
p_ripen = df.loc[:,'ripen'].sum()/sum(df.sum())

p_apple_given_ripen = (p_ripen_given_apple*p_apple)/p_ripen
p_apple_given_ripen

0.15456674473067916

Mini Assignment:

Generate a randum array of size 10 by 10 and convert it to pandas dataframe with column name and row name of your interest
Perform feature scaling (minimax scalar and standard scalar) by direct calculation of minimunm, maximum, mean, standard deviation from dataframe/array
Implement Scikit-learn preprocessing pipeline to perform feature scaling