程序代写代做代考 matlab In [2]:
In [2]:
import scipy.io as sio
import pandas as pd
import numpy as np
In [3]:
arr = sio.loadmat(‘/Users/vagrant/tasks-2017/wangwang-pingdan_21/matlab-1600/a
ssignment1/asg1-2017/DataA.mat’)
In [4]:
fea = arr[‘fea’]
In [5]:
fea.shape
In [6]:
fea
In [7]:
df = pd.DataFrame(fea);
In [8]:
df.to_csv(‘fea.csv’)
In [9]:
df[df.isnull().any(axis=1)].shape
Out[5]:
(19000, 81)
Out[6]:
array([[-153., 414., 939., …, -29., 36., 24.],
[-150., 420., 939., …, -31., 47., 3.],
[-160., 432., 941., …, -38., 20., 0.],
…,
[ nan, nan, nan, …, nan, nan, nan],
[ nan, nan, nan, …, nan, nan, nan],
[ nan, nan, nan, …, nan, nan, nan]])
Out[9]:
(19000, 81)
In [29]:
df[df.isnull().all(axis=1)].shape
I. I. Data Cleaning and Preprocessing (for dataset A)
1
The dataset has 19000 rows and 81 columns. All rows have at least 1 missing value. There are 773 rows
whose 81 values are all missing.
2
I remove the 773 rows whose 81 values are all missing and fill other missing value with the mean value of
that column.
In [28]:
dfAfterRemoveEmptyRows = df.ix[~(df.isnull().all(axis=1))]
processed = dfAfterRemoveEmptyRows.fillna(df.mean())
processed
Out[29]:
(773, 81)
Out[28]:
0 1 2 3 4 5
0 -153.000000 414.000000 939.000000 -161.000000 1007.000000 99.000000
1 -150.000000 420.000000 939.000000 -177.000000 1008.000000 103.000000
2 -160.000000 432.000000 941.000000 -162.000000 982.000000 98.000000
3 -171.000000 432.000000 911.000000 -174.000000 999.000000 115.000000
4 -171.000000 698.264485 929.000000 -189.000000 1004.000000 104.000000
5 -171.000000 432.000000 924.000000 -179.000000 1011.000000 85.000000
6 -169.000000 429.000000 949.000000 -175.000000 1007.000000 102.000000
7 -160.000000 423.000000 927.000000 -195.000000 996.000000 123.000000
8 -163.000000 432.000000 929.000000 -178.000000 994.000000 101.000000
9 -156.000000 415.000000 936.000000 -186.000000 1014.000000 111.000000
10 -153.000000 413.000000 923.000000 -187.000000 993.000000 91.000000
11 -168.000000 412.000000 904.000000 -194.000000 989.000000 115.000000
12 -166.000000 442.000000 926.000000 -191.000000 1001.000000 114.000000
13 -162.000000 447.000000 920.000000 -218.000000 1000.000000 110.000000
14 -184.000000 442.000000 941.000000 -237.000000 992.000000 144.000000
15 -157.000000 427.000000 925.000000 -245.000000 986.000000 127.000000
16 -158.000000 427.000000 905.000000 -218.000000 990.000000 111.000000
17 -153.000000 451.000000 889.000000 -260.000000 967.000000 112.000000
18 -150.000000 443.000000 928.000000 -243.000000 968.000000 130.000000
19 -151.000000 442.000000 930.000000 -260.000000 991.000000 92.000000
20 -153.000000 462.000000 940.000000 -253.000000 977.000000 135.000000
21 -170.000000 459.000000 927.000000 -247.000000 959.000000 133.000000
22 -148.000000 453.000000 923.000000 -274.000000 992.000000 142.000000
23 -154.000000 449.000000 928.000000 -274.000000 948.000000 115.000000
24 -165.000000 462.000000 918.000000 -302.000000 974.000000 129.000000
25 -187.000000 479.000000 926.000000 -304.000000 966.000000 122.000000
26 -169.000000 473.000000 947.000000 -305.000000 965.000000 123.000000
27 -172.000000 482.000000 941.000000 -314.000000 946.000000 163.000000
28 -203.000000 486.000000 917.000000 -314.000000 972.000000 145.000000
29 -200.000000 491.000000 898.000000 -312.000000 978.000000 123.000000
… … … … … … …
18197 -132.812384 698.264485 597.541402 -102.000000 967.000000 73.000000
18198 -132.812384 698.264485 597.541402 -106.000000 1130.000000 85.000000
18199 -132.812384 698.264485 597.541402 0.000000 0.000000 0.000000
18200 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18201 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18202 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18203 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18204 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18205 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18206 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18207 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18208 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18209 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18210 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18211 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18212 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18213 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
In [48]:
from sklearn import preprocessing
minMaxScaler = preprocessing.MinMaxScaler()
minMaxScaledDf = pd.DataFrame(minMaxScaler.fit_transform(processed))
zscoreScaler = preprocessing.StandardScaler()
zscoreDf = pd.DataFrame(zscoreScaler.fit_transform(processed))
zscoreDf = pd.DataFrame(preprocessing.scale(processed))
minMaxScaledDf
18214 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18215 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18216 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18217 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18218 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18219 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18220 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18221 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18222 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18223 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18224 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18225 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18226 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824
18227 rows × 81 columns
In [51]:
import matplotlib.pyplot as plt
for featureInd in (9,24):
# featureInd = 9
processed.ix[:, featureInd – 1].hist()
plt.title(‘Feature %d histogram before normalization’ % featureInd)
plt.show()
plt.figure()
minMaxScaledDf.ix[:, featureInd – 1].hist()
plt.title(‘Feature %d histogram after min-max normalization’ % featureInd)
plt.show()
plt.figure()
zscoreDf.ix[:, featureInd – 1].hist()
plt.title(‘Feature %d histogram after z-score normalization’ % featureInd)
plt.show()
In [61]:
from pandas.tools.plotting import autocorrelation_plot
for featureInd in (9,24):
autocorrelation_plot(processed.ix[:, featureInd-1])
plt.title(‘Feature %d auto-correlation plot before normalization’ % featur
eInd)
plt.show()
autocorrelation_plot( minMaxScaledDf.ix[:, featureInd – 1])
plt.title(‘Feature %d auto-correlation plot after min-max normalization’ %
featureInd)
plt.show()
autocorrelation_plot( zscoreDf.ix[:, featureInd – 1])
plt.title(‘Feature %d auto-correlation plot after z-score normalization’ %
featureInd)
plt.show()
II. Feature Extraction (for dataset B)
1. Use PCA as a dimensionality reduction technique to the
data, compute the eigenvectors and eigenvalues.
In [65]:
arr = sio.loadmat(‘/Users/vagrant/tasks-2017/wangwang-pingdan_21/matlab-1600/a
ssignment1/asg1-2017/DataB.mat’)
arr
In [66]:
feaDf = pd.DataFrame(arr[‘fea’]);
gndDf = pd.DataFrame(arr[‘gnd’]);
In [88]:
from sklearn.decomposition import PCA
pca = PCA()
trans = pca.fit(feaDf).transform(feaDf)
trans = pd.DataFrame(trans)
trans
Out[65]:
{‘__globals__’: [],
‘__header__’: ‘MATLAB 5.0 MAT-file, Platform: PCWIN64, Created on
: Wed Sep 24 09:53:02 2014’,
‘__version__’: ‘1.0’,
‘fea’: array([[4, 4, 3, …, 1, 4, 5],
[5, 1, 4, …, 3, 5, 4],
[1, 3, 0, …, 1, 2, 4],
…,
[2, 3, 2, …, 1, 1, 3],
[5, 2, 4, …, 4, 3, 4],
[3, 3, 1, …, 1, 3, 1]], dtype=uint8),
‘gnd’: array([[0],
[0],
[0],
…,
[4],
[4],
[4]], dtype=uint8)}
Out[88]:
0 1 2 3 4 5
0 -1069.166304 -513.973184 -139.243261 878.387704 387.873484 -335.304982
1 -1099.176077 -570.842223 -67.311779 839.381070 345.573249 -530.737220
2 -673.201385 -167.377150 480.988638 83.823068 1036.833666 76.531663
3 -1010.903339 -187.044145 506.352247 426.446929 901.897549 73.661148
4 -1692.970822 -633.369398 -521.943052 367.356716 -6.919257 -601.851221
5 -1341.694310 -536.770010 -578.489504 246.425866 -577.933576 140.191582
6 -1217.832623 -521.312900 -116.355397 240.552050 680.648563 -501.999433
7 -226.761138 -457.324861 -284.321899 128.615747 -1017.001492 -87.023369
8 -1219.222302 -479.274443 8.018715 -521.169429 722.007449 195.983302
9 -900.753262 -548.126694 30.258637 46.963719 761.675828 -435.396163
10 -1119.444673 -430.540486 -4.091402 150.580695 1045.936651 -171.263259
11 -1132.061141 -688.199951 -534.771525 -8.874299 -914.504627 85.645282
12 -1283.397863 -396.151025 187.789686 40.171936 787.148009 63.436551
13 -431.797862 -454.763166 -364.204768 166.199142 -1016.920930 280.651043
14 -1368.131762 -641.515614 -732.070197 619.218875 -244.318899 -49.976603
15 -784.250675 -419.946331 -589.587363 470.526795 -85.742394 -123.237859
16 -1455.821769 -519.039116 109.383460 961.047935 647.104030 -564.693429
17 -1406.742684 -1010.360551 -236.296620 302.702769 -1052.810187 99.689728
18 -809.569532 -355.676642 -553.061747 739.156447 -342.576671 -298.188245
19 -1009.720480 -500.758620 148.698733 -206.740212 942.652635 179.051437
20 -1256.737747 -136.282075 4.767105 401.171633 111.170531 20.126463
21 -1097.738331 -791.366616 -662.612802 416.147079 -787.374203 -393.630293
22 -1237.245760 -1011.960330 -384.654272 69.297448 -1141.412452 115.369106
23 -337.006350 -166.084273 -519.665873 130.820076 -403.671160 -213.609312
24 -622.921863 -177.985457 -414.160463 -401.750020 2.516656 192.630114
25 -1058.479796 -679.243567 -567.416761 49.160913 -884.302279 17.978966
26 -170.420127 205.716202 -134.305883 -28.439419 90.064382 81.864711
27 -937.063555 -120.700099 -159.267905 29.237288 -114.534026 -366.615037
28 -232.650631 -283.345693 -543.440591 257.837187 -835.285216 -363.820625
29 -899.559652 -780.081817 -483.711259 169.414276 -1077.021039 151.471500
… … … … … … …
2036 224.500021 717.688537 -195.952977 -499.323792 -267.286036 -368.337387
2037 -435.315514 1060.086518 -252.240653 -784.207808 17.898867 -426.553471
2038 238.402523 821.431429 -296.668932 72.066998 302.033654 -169.441235
2039 479.799398 304.242620 -530.251819 -263.518464 -18.481148 -269.480921
2040 -85.398938 899.484460 -359.165698 -478.939958 30.973590 -421.339300
2041 -975.896906 891.969722 -658.015458 -273.613739 -529.811171 -94.804905
2042 -626.495241 838.788408 -639.081008 212.654848 -340.914705 80.270632
2043 258.028617 579.227378 -335.781308 -127.985693 149.696587 -147.257143
2044 252.991333 361.106673 -45.994668 -275.875656 123.818071 46.475332
2045 -43.933473 596.114295 -360.973687 -150.061260 152.710758 44.882070
In [78]:
feaDf[gndDf.ix[:,0] == 1]
gndDf.max()
2046 8.716199 710.432025 -23.709776 -105.468206 -499.729993 -119.021552
2047 432.475888 548.321160 -174.616090 99.150459 -3.700452 -295.330003
2048 -164.177255 959.226629 -239.753319 -546.282908 -434.398609 -269.636934
2049 19.806668 944.896452 -408.627564 -141.635179 282.084948 -27.051497
2050 260.650271 726.328779 -374.089981 326.390047 -251.397107 -172.193151
2051 337.640613 568.111744 -358.456523 -207.139564 -262.826142 -492.078627
2052 111.635685 782.005670 -337.485075 -404.216222 158.940143 -295.307423
2053 575.789733 321.964107 193.327055 -4.403837 546.809983 104.206169
2054 -56.731706 983.452324 -101.364217 -575.686798 191.904316 -464.280977
2055 391.142965 529.109716 -360.665581 34.201602 227.983142 -82.175911
2056 222.286099 790.011945 -279.594895 -583.068983 -311.732657 -212.255003
2057 432.180822 776.933460 49.309403 -306.113951 153.785454 -115.321890
2058 -141.001824 566.194886 -347.955180 69.169126 -373.710178 241.262334
2059 -467.623465 718.782394 -152.832548 -135.848772 344.854140 76.905198
2060 -514.275826 1049.482202 -256.796813 -906.542691 -121.841814 -276.421567
2061 24.355662 742.490057 -467.186538 -255.047804 -157.302478 -239.561076
2062 -48.768593 734.458335 -334.353122 -496.721083 117.983630 -467.477372
2063 -131.021601 866.607035 -397.861565 -248.089962 45.492451 -93.904547
2064 262.141229 652.777351 -347.602739 72.427962 -80.070774 -164.182531
2065 480.891094 432.743142 18.124027 -364.053056 502.566777 428.276701
2066 rows × 784 columns
Out[78]:
0 4
dtype: uint8
In [89]:
colors = [‘navy’, ‘turquoise’, ‘darkorange’, ‘red’, ‘green’]
classes = [0, 1, 2, 3, 4]
classesStr = [‘class ‘ + str(i) for i in classes]
for color, i, class_name in zip(colors, classes, classesStr):
t = trans[gndDf.ix[:,0] == i]
plt.scatter(t.ix[:,0], t.ix[:,1], color=color, alpha=.8,
label=class_name)
plt.legend(loc=’best’, shadow=False, scatterpoints=1)
plt.title(‘2 dimensional representation with first and second principal compon
ents’)
plt.show()
In [90]:
for color, i, class_name in zip(colors, classes, classesStr):
t = trans[gndDf.ix[:,0] == i]
plt.scatter(t.ix[:,4], t.ix[:,5], color=color, alpha=.8,
label=class_name)
plt.legend(loc=’best’, shadow=False, scatterpoints=1)
plt.title(‘2 dimensional representation with first and second principal compon
ents’)
plt.show()
In [101]:
from sklearn.naive_bayes import GaussianNB
class_errors = []
retainedVars = []
for n in [2, 4, 10, 30, 60, 200, 500,784]:
gnb = GaussianNB()
y_pred = gnb.fit(trans.ix[:,:n], gndDf.ix[:,0]).predict(trans.ix[:,:n])
class_error = sum(y_pred != gndDf.ix[:,0])*1.0 / y_pred.shape[0]
class_errors.append(class_error)
retainedVars.append(sum(pca.explained_variance_[:n]) / sum(pca.explained_v
ariance_))
plt.plot(retainedVars, class_errors)
plt.xlabel(‘retained variance’)
plt.ylabel(‘classication error’)
plt.title(‘Naive Bayes Classification Error’)
plt.show()
In [102]:
In [103]:
In [ ]:
Out[102]:
[0.17328170377541141,
0.10067763794772508,
0.0701839303000968,
0.059535333978702809,
0.047918683446272994,
0.059535333978702809,
0.19070667957405615,
0.23233301064859632]
Out[103]:
[0.2208652726402634,
0.34489263092230382,
0.54222178827782175,
0.76145556146398985,
0.87275854960645693,
0.97511626228772696,
0.99975130751258978,
1.0]