程序代写代做代考 final-checkpoint
final-checkpoint
In [47]:
import os

import pandas as pd
from sklearn import linear_model
# Folder that holds the DfTRoadSafety_*.csv files (trailing slash is relied on
# by cells that build paths with plain string concatenation).
directory = "accident/"
def readYear(year, data_dir=None):
    """Load one year's accident, vehicle and casualty CSVs and join them.

    Parameters
    ----------
    year : str or int
        Year used in the file names, e.g. "2013" or 2013.
    data_dir : str, optional
        Folder containing the CSV files. Defaults to the notebook-level
        ``directory`` setting when omitted.

    Returns
    -------
    pandas.DataFrame
        Accidents inner-joined with vehicles, then with casualties, on
        ``Accident_Index``.

    Notes
    -----
    Both joins use only ``Accident_Index``, so every vehicle row of an
    accident is paired with every casualty row of that accident. Columns
    present in both the vehicle and casualty tables come back with pandas'
    ``_x`` / ``_y`` suffixes (e.g. ``Vehicle_Reference_x``).
    """
    base = directory if data_dir is None else data_dir
    year = str(year)

    def load(table):
        # File naming scheme: DfTRoadSafety_<Table>_<year>.csv
        path = os.path.join(base, "DfTRoadSafety_" + table + "_" + year + ".csv")
        # utf-8-sig strips a leading byte-order mark from the header row, if any.
        return pd.read_csv(path, encoding="utf-8-sig")

    acci5 = load("Accidents")
    casul5 = load("Casualties")
    vehicle5 = load("Vehicles")

    merged5 = pd.merge(acci5, vehicle5, on='Accident_Index', how='inner')
    merged5 = pd.merge(merged5, casul5, on='Accident_Index', how='inner')
    return merged5
def readAll(years=("2013", "2014", "2015")):
    """Load and stack the merged tables for several years.

    Parameters
    ----------
    years : iterable of str, optional
        Years to load with ``readYear``; defaults to 2013-2015.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-year frames. Each year's original
        row labels are kept, so index values repeat across years.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``years`` is empty.
    """
    frames = [readYear(year) for year in years]
    merged5 = pd.concat(frames)
    return merged5
In [48]:
# Load the three raw 2013 tables into separate, un-merged frames.
year = "2013"
acci5 = pd.read_csv(directory + "DfTRoadSafety_Accidents_" + year + ".csv", encoding="utf-8-sig")
casul5 = pd.read_csv(directory + "DfTRoadSafety_Casualties_" + year + ".csv", encoding="utf-8-sig")
vehicle5 = pd.read_csv(directory + "DfTRoadSafety_Vehicles_" + year + ".csv", encoding="utf-8-sig")
In [49]:
# Display the 2013 vehicles frame (pandas truncates the rendered rows and columns).
vehicle5
Out[49]:
Accident_Index Vehicle_Reference Vehicle_Type Towing_and_Articulation Vehicle_Manoeuvre Vehicle_Location-Restricted_Lane Junction_Location Skidding_and_Overturning Hit_Object_in_Carriageway Vehicle_Leaving_Carriageway … 1st_Point_of_Impact Was_Vehicle_Left_Hand_Drive? Journey_Purpose_of_Driver Sex_of_Driver Age_Band_of_Driver Engine_Capacity_(CC) Propulsion_Code Age_of_Vehicle Driver_IMD_Decile Driver_Home_Area_Type
0 201301BS70003 1 8 0 6 0 2 0 0 0 … 1 1 1 1 8 2402 2 10 8 1
1 201301BS70003 2 1 0 18 0 2 0 0 0 … 1 1 6 1 7 -1 -1 -1 8 1
2 201301BS70005 1 3 0 18 0 1 0 0 0 … 1 1 6 2 6 124 1 5 4 1
3 201301BS70006 1 11 0 14 0 1 0 0 0 … 0 1 1 1 8 -1 -1 -1 1 1
4 201301BS70007 1 3 0 4 0 8 0 0 0 … 4 1 6 1 7 124 1 7 4 1
5 201301BS70007 2 9 0 18 0 8 0 0 0 … 0 1 6 1 7 1985 1 10 4 1
6 201301BS70009 1 9 0 14 0 8 0 0 0 … 2 1 6 1 6 2494 1 8 5 1
7 201301BS70009 2 9 0 18 0 8 0 0 0 … 1 1 6 1 6 1995 2 13 5 1
8 201301BS70010 1 9 0 18 0 8 0 0 0 … 1 1 6 1 -1 2148 2 7 5 1
9 201301BS70012 1 8 0 6 0 8 0 0 0 … 3 1 1 1 9 2664 2 14 4 1
10 201301BS70012 2 5 0 18 0 8 0 0 0 … 1 1 6 1 7 1099 1 2 7 1
11 201301BS70013 1 1 0 7 0 8 0 0 0 … 1 1 6 1 5 -1 -1 -1 6 1
12 201301BS70015 1 9 0 16 0 8 0 0 0 … 0 1 6 3 -1 -1 -1 -1 -1 -1
13 201301BS70018 1 9 0 9 0 8 0 0 0 … 4 1 6 1 7 5461 1 1 4 1
14 201301BS70018 2 1 0 18 0 8 0 0 0 … 1 1 6 2 6 -1 -1 -1 7 1
15 201301BS70019 1 5 0 6 0 0 0 0 0 … 3 1 6 1 6 750 1 13 7 1
16 201301BS70019 2 19 0 18 0 0 0 0 0 … 1 1 1 1 8 2402 2 8 10 1
17 201301BS70020 1 1 0 18 0 8 0 0 0 … 3 1 6 2 6 -1 -1 -1 3 1
18 201301BS70020 2 9 0 9 0 8 0 0 0 … 1 1 6 2 -1 1598 1 5 -1 -1
19 201301BS70021 1 9 0 18 0 8 0 0 0 … 3 1 6 2 7 1242 1 1 -1 -1
20 201301BS70021 2 19 0 18 0 8 0 0 0 … 1 1 6 1 -1 -1 -1 -1 1 1
21 201301BS70023 1 3 0 4 0 8 0 0 0 … 2 1 6 1 6 124 1 2 2 1
22 201301BS70023 2 9 0 18 0 8 0 0 0 … 1 1 6 1 7 1995 2 6 5 1
23 201301BS70024 1 9 0 18 0 0 0 0 0 … 3 1 6 1 7 1800 1 6 2 1
24 201301BS70024 2 9 0 18 0 0 0 4 0 … 1 1 6 1 -1 2979 1 10 4 1
25 201301BS70024 3 9 0 2 0 0 0 0 0 … 1 1 6 1 -1 1389 1 16 -1 -1
26 201301BS70024 4 9 0 2 0 0 0 0 0 … 1 1 6 1 -1 1560 2 8 8 1
27 201301BS70024 5 9 0 2 0 0 0 0 0 … 1 1 6 1 -1 1995 1 2 8 1
28 201301BS70025 1 5 0 18 0 8 0 0 0 … 1 1 6 1 5 599 1 3 5 1
29 201301BS70027 1 9 0 18 0 8 0 0 0 … 1 1 6 3 -1 -1 -1 -1 -1 -1
… … … … … … … … … … … … … … … … … … … … … …
252883 2013984132713 1 9 0 18 0 0 0 0 0 … 1 1 1 1 5 1229 1 -1 8 2
252884 2013984132713 2 9 0 18 0 0 1 0 0 … 2 1 6 1 9 1329 1 2 10 2
252885 2013984133213 1 9 0 17 0 0 2 0 1 … 1 1 6 1 5 998 1 10 -1 2
252886 2013984133413 1 9 0 11 0 0 5 6 2 … 1 1 6 1 7 1998 1 9 -1 1
252887 2013984133713 1 9 0 18 0 6 0 0 3 … 1 1 6 1 4 1870 2 2 -1 -1
252888 2013984133713 2 9 0 18 0 1 0 0 1 … 3 1 2 2 8 1398 2 -1 -1 3
252889 2013984134713 1 9 0 18 0 0 1 0 2 … 3 1 2 1 6 1248 2 2 6 1
252890 2013984134813 1 9 0 18 0 0 0 0 2 … 1 1 6 1 8 1896 2 8 3 1
252891 2013984135613 1 9 0 17 0 0 0 0 1 … 1 1 1 1 6 1296 1 7 5 1
252892 2013984136513 1 20 0 18 0 0 0 0 0 … 0 1 1 3 -1 -1 -1 -1 -1 -1
252893 2013984136713 1 9 0 1 0 8 0 0 0 … 2 1 6 1 7 1399 2 5 -1 3
252894 2013984138613 1 21 0 7 0 2 0 0 0 … 1 1 1 1 8 -1 -1 -1 -1 2
252895 2013984138613 2 9 0 7 0 2 0 0 0 … 2 1 6 2 8 1390 1 7 4 1
252896 2013984138913 1 19 0 17 0 0 0 0 1 … 1 1 1 1 8 1560 2 1 -1 3
252897 2013984139013 1 21 1 18 0 0 5 0 0 … 4 1 1 1 9 -1 -1 -1 -1 -1
252898 2013984139913 1 19 0 18 0 0 0 12 0 … 1 1 1 1 7 2402 2 7 2 1
252899 2013984141213 1 9 0 9 0 6 0 0 3 … 3 1 6 1 5 1242 1 9 10 1
252900 2013984141213 2 9 0 18 0 1 0 0 7 … 1 1 6 1 7 2993 2 5 9 1
252901 2013984141313 1 9 0 16 0 0 0 0 5 … 1 1 6 1 4 1998 1 22 -1 2
252902 2013984141713 1 9 0 9 0 1 0 0 0 … 0 1 6 1 11 1686 2 19 -1 3
252903 2013984141713 2 9 0 18 0 1 0 0 0 … 1 1 6 1 6 1560 2 5 -1 3
252904 2013984141713 3 9 0 10 0 1 0 0 0 … 1 1 6 1 4 1598 1 7 -1 3
252905 2013984142113 1 9 0 9 0 6 0 0 0 … 1 1 6 2 5 1242 1 -1 -1 2
252906 2013984142113 2 9 0 18 0 8 0 0 0 … 4 1 6 2 9 1124 1 11 -1 3
252907 2013984142213 1 9 0 9 0 1 0 0 0 … 4 1 6 3 -1 -1 -1 -1 -1 -1
252908 2013984142413 1 9 0 18 0 8 0 0 7 … 1 1 6 1 11 2400 2 3 -1 3
252909 2013984142413 2 9 0 18 0 8 0 0 7 … 2 1 6 1 8 1560 2 7 -1 1
252910 2013984142413 3 21 0 18 0 1 0 0 0 … 0 1 1 1 9 12777 2 5 7 3
252911 2013984142813 1 9 0 18 0 0 1 0 0 … 3 1 6 1 6 1598 2 3 9 1
252912 2013984142813 2 9 0 18 0 0 0 0 0 … 1 1 6 1 7 1870 2 9 2 1
252913 rows × 21 columns
In [50]:
# Merged accident/vehicle/casualty table for 2013 only.
t13 = readYear("2013")
In [ ]:
In [51]:
def run():
    """Fit a class-weighted logistic regression for Casualty_Severity and report it.

    Steps: load all years via ``readAll``, build a numeric feature matrix,
    fit the model, print the predicted class counts and the accuracy on the
    same rows it was fitted on, print the 20 features with the largest summed
    absolute coefficients, and write predictions next to the true labels to
    ``compare.csv``.

    Returns
    -------
    None
    """
    merged5 = readAll()

    # Target, row identifiers and the object-typed columns are excluded from
    # the feature matrix. A missing column raises KeyError, as before.
    non_features = [
        'Casualty_Severity',
        'Accident_Index',
        'Vehicle_Reference_x',
        'Vehicle_Reference_y',
        'Casualty_Reference',
        'Date',
        'Time',
        'Local_Authority_(Highway)',
        'LSOA_of_Accident_Location',
    ]
    # Keyword axis: the positional form drop(label, 1) is gone in pandas 2.x.
    X = merged5.drop(non_features, axis=1)
    # Fill NaNs with the column mean.
    # NOTE(review): the raw tables appear to use -1 as a "missing" code; those
    # values are not NaN and pass through unchanged - confirm this is intended.
    X = X.fillna(X.mean())
    Y = merged5['Casualty_Severity']

    # NOTE(review): Accident_Severity stays in X; it presumably summarises the
    # casualty severities of the same accident and would then leak the target
    # - confirm before trusting the accuracy or the coefficient ranking.
    logreg = linear_model.LogisticRegression(C=1e5, class_weight={1: 100, 2: 10, 3: 1})
    logreg.fit(X, Y)
    res = logreg.predict(X)

    # Vectorised reductions instead of Python-level sum() over every row.
    print([int((res == label).sum()) for label in (1, 2, 3)])
    # Accuracy on the training rows themselves (no hold-out split here).
    print(float((res == Y).mean()))

    # Rank features by absolute coefficient, summed over the per-class rows.
    # Features are not rescaled, so magnitudes depend on each column's units.
    t = abs(logreg.coef_)
    cosum = t.sum(axis=0)
    model_coefficient = pd.Series(cosum, index=X.columns)
    top20 = model_coefficient.sort_values(ascending=False).head(20)
    print(model_coefficient.shape)
    print(top20)

    outD = pd.DataFrame({"pred": res, "correct": Y})
    outD.to_csv("compare.csv", index=False, header=True)
In [52]:
# Build the feature matrix X and target Y from all three years.
merged5 = readAll()
# Object-typed columns that cannot be fed to the model as numbers.
lc = ['Date',
      'Time',
      'Local_Authority_(Highway)',
      'LSOA_of_Accident_Location']
# Target and row identifiers are removed together with the object columns.
# Keyword axis: the positional form drop(label, 1) is gone in pandas 2.x.
X = merged5.drop(['Casualty_Severity',
                  'Accident_Index',
                  'Vehicle_Reference_x',
                  'Vehicle_Reference_y',
                  'Casualty_Reference'] + lc, axis=1)
# Fill NaNs with the column mean.
X = X.fillna(X.mean())
Y = merged5['Casualty_Severity']
In [53]:
# Sanity-check the target: print and collect every Casualty_Severity value
# that is not 1, 2 or 3. A boolean mask replaces the row-by-row Python scan.
arr = []
for t in Y[~Y.isin([1, 2, 3])]:
    print(t)
    arr.append(int(t))
print(len(arr))
print(Y.shape)
0
(1055110,)
In [55]:
# Map each dtype present in merged5 to the column names that have it.
merged5.columns.to_series().groupby(merged5.dtypes).groups
Out[55]:
{dtype(‘int64′): [u’1st_Point_of_Impact’,
u’1st_Road_Class’,
u’1st_Road_Number’,
u’2nd_Road_Class’,
u’2nd_Road_Number’,
u’Accident_Severity’,
u’Age_Band_of_Casualty’,
u’Age_Band_of_Driver’,
u’Age_of_Vehicle’,
u’Bus_or_Coach_Passenger’,
u’Car_Passenger’,
u’Carriageway_Hazards’,
u’Casualty_Class’,
u’Casualty_Home_Area_Type’,
u’Casualty_Reference’,
u’Casualty_Severity’,
u’Casualty_Type’,
u’Day_of_Week’,
u’Did_Police_Officer_Attend_Scene_of_Accident’,
u’Driver_Home_Area_Type’,
u’Driver_IMD_Decile’,
u’Engine_Capacity_(CC)’,
u’Hit_Object_in_Carriageway’,
u’Hit_Object_off_Carriageway’,
u’Journey_Purpose_of_Driver’,
u’Junction_Control’,
u’Junction_Detail’,
u’Junction_Location’,
u’Light_Conditions’,
u’Local_Authority_(District)’,
u’Number_of_Casualties’,
u’Number_of_Vehicles’,
u’Pedestrian_Crossing-Human_Control’,
u’Pedestrian_Crossing-Physical_Facilities’,
u’Pedestrian_Location’,
u’Pedestrian_Movement’,
u’Pedestrian_Road_Maintenance_Worker’,
u’Police_Force’,
u’Propulsion_Code’,
u’Road_Surface_Conditions’,
u’Road_Type’,
u’Sex_of_Casualty’,
u’Sex_of_Driver’,
u’Skidding_and_Overturning’,
u’Special_Conditions_at_Site’,
u’Speed_limit’,
u’Towing_and_Articulation’,
u’Urban_or_Rural_Area’,
u’Vehicle_Leaving_Carriageway’,
u’Vehicle_Location-Restricted_Lane’,
u’Vehicle_Manoeuvre’,
u’Vehicle_Reference_x’,
u’Vehicle_Reference_y’,
u’Vehicle_Type’,
u’Was_Vehicle_Left_Hand_Drive?’,
u’Weather_Conditions’],
dtype(‘float64′): [u’Age_of_Casualty’,
u’Age_of_Driver’,
u’Latitude’,
u’Location_Easting_OSGR’,
u’Location_Northing_OSGR’,
u’Longitude’],
dtype(‘O’): [u’Accident_Index’,
u’Date’,
u’LSOA_of_Accident_Location’,
u’Local_Authority_(Highway)’,
u’Time’]}
In [58]:
# Rows per target class (1, 2, 3), then the share of class 3 - the accuracy
# of always predicting class 3.
print([int((Y == label).sum()) for label in (1, 2, 3)])
print(float((Y == 3).mean()))
[9095, 111143, 934872]
0.886042213608
In [60]:
# Class-weighted logistic regression; classes 1 and 2 get larger weights than 3.
logreg = linear_model.LogisticRegression(C=1e5, class_weight={1: 1000, 2: 10, 3: 1})
logreg.fit(X, Y)
# Predictions are made on the same rows the model was fitted on.
res = logreg.predict(X)
# Vectorised counts instead of Python-level sum() over every row.
print([int((res == label).sum()) for label in (1, 2, 3)])
print(float((res == Y).mean()))
[1274, 120124, 933712]
0.939066068941
In [63]:
# Rank features by absolute coefficient, summed over the per-class rows.
# NOTE(review): X is not rescaled in the visible cells, so magnitudes depend
# on each column's units - treat the ranking as indicative only.
t = abs(logreg.coef_)
cosum = t.sum(axis=0)
model_coefficient = pd.Series(cosum, index=X.columns)
top10 = model_coefficient.sort_values(ascending=False).head(10)
print(model_coefficient.shape)
print(top10)
# Save predictions next to the true labels for offline comparison.
outD = pd.DataFrame({"pred": res, "correct": Y})
outD.to_csv("compare.csv", index=False, header=True)
(58,)
Accident_Severity 7.743353
Sex_of_Casualty 0.457001
Did_Police_Officer_Attend_Scene_of_Accident 0.363090
Latitude 0.338320
Car_Passenger 0.317477
Urban_or_Rural_Area 0.144427
Age_Band_of_Casualty 0.137477
Pedestrian_Location 0.134923
Road_Type 0.121066
Bus_or_Coach_Passenger 0.117226
dtype: float64
In [ ]:
In [ ]: