In [1]:
import pandas as pd
import numpy as np
import pickle
import pdb

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)

pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

import pandas_profiling as pp

from IPython.display import Image
from IPython.core.display import HTML

from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve


from sklearn.model_selection import train_test_split
from time import time
import matplotlib.pyplot as plt


import lime
import warnings
warnings.filterwarnings('ignore')

Machine Learning - Theory and Practice

  • Supervised learning
    • Classification
    • Regression
  • Unsupervised learning
    • Clustering
  • Dimensionality reduction
  • Anomaly detection
  • Reinforcement learning
In [2]:
Image(url= "figures/CRISP-DM_Process_Diagram.png", width=600, height=600)
Out[2]:

Source: Kenneth Jensen - Own work based on: ftp://public.dhe.ibm.com/software/analytics/spss/documentation/modeler/18.0/en/ModelerCRISPDM.pdf 

Business understanding

Data Analytics for Railroad Empty-to-Load Peak Kips Prediction - RAS Problem Solving Competition

Background

  • Peak kips denotes the total vertical force a wheel imposes on the rail
  • Detected using Wheel Impact Load Detectors (WILD)
  • The higher the peak kips, the more damage is done to both the wheels and the track
  • Railroads use this measure to detect defects in either the track or the wheels
  • An alarm is triggered if the peak kips value is greater than or equal to 90, and the cars involved must be set out for inspection and repair

Problem definition

  1. Predict the actual peak kips value at the next loaded status. The desired solution keeps the difference between the actual and predicted kips values within +/- 2 kips for 90% of observations in the test dataset;
  2. Predict whether an alarm will be triggered at the next loaded status. We will consider any peak kips reading ≥ 90 a positive event (“an alarm”). The evaluation criterion is to make the true positive rate as high as possible while minimising the false positive rate in the test dataset (a sketch of both checks follows this list).
    • Predict the next loaded status within 10 days
    • Assume we are only interested in predicting a loaded status whose immediately preceding status is empty (i.e., E -> L)
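A minimal sketch of how both evaluation criteria could be computed (illustrative only; y_true, y_pred, alarm_true and alarm_pred are hypothetical NumPy arrays, not competition data):

import numpy as np

def within_2_kips_rate(y_true, y_pred):
    # Share of observations whose prediction error is within +/- 2 kips;
    # the goal is for this rate to reach 0.9 on the test dataset
    return np.mean(np.abs(y_true - y_pred) <= 2)

def alarm_rates(alarm_true, alarm_pred):
    # True and false positive rates for the alarm task (peak kips >= 90)
    tpr = np.sum(alarm_true & alarm_pred) / np.sum(alarm_true)
    fpr = np.sum(~alarm_true & alarm_pred) / np.sum(~alarm_true)
    return tpr, fpr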

Data Understanding

  • Equipment-related. This includes (but is not limited to):

    • eqp_grs_tons - weight of the equipment
    • edr_eqp_spd - speed of the equipment
    • aar_ct_c - equipment type
    • grs_rail_wgt - wheel type
    • whl_peak_kips - previous peak kips readings of the wheel
    • whl_avg_kips - previous average kips readings of the wheel
    • whl_dyn_kips - previous dynamic kips readings of the wheel
    • whl_dyn_ratio - previous dynamic ratio of the wheel
    • age - age of the wheel
    • axle_side and eqp_axle_nbr - location of the wheel in the equipment
  • Location-related:

    • vndr_edr_nme - location of the detectors
  • Time-related:
    • Month/year of the readings
  • Vendor-related:
    • edr_vndr - Vendor of the WILD detectors
In [3]:
# Load data
df_train = pd.read_csv(r'Ras Problem Training Data.tab',sep ='\t')
df_train.head()
Out[3]:
unq_whl_id LOAD_EMPTY trn_id vnder_edr_ts_cst VNDR_EDR_NME EDR_VNDR AGE AXLE_SIDE EQP_AXLE_NBR EQP_SEQ_NBR WHL_AVG_KIPS WHL_PEAK_KIPS WHL_DYN_KIPS WHL_DYN_RATIO EDR_EQP_SPD EQP_GRS_TONS TARE AAR_CT_C EQP_GRP GRS_RAIL_WGT KIPDAYS RPR_WHY_CD
0 EQVI503147L7-2011-05-28::2014-03-08 L SOSFVFR229K 01JUN2013:03:01:00 BAGD SALIENT 735 L 7.0 10.0 36.35 40.24 3.89 1.11 41.57 345.6 94.0 S162 IFLT 800000.0 -999.00 11.0
1 EQVI717414R1-2013-05-15::2013-09-17 E XSJDVPQ914G 15AUG2013:04:55:00 COCH SALIENT 92 R 1.0 73.0 7.23 10.81 3.58 1.50 56.95 30.4 30.0 C114 HOPP 286000.0 -999.00 11.0
2 EQVI587556R3-2013-05-13::2014-08-12 L QDWJODF329A 03DEC2013:05:48:00 BAGD SALIENT 204 R 3.0 4.0 32.47 46.97 14.50 1.45 45.79 194.2 65.0 S635 IFLT 487000.0 117.58 65.0
3 GHWA626L4-2010-07-22::2014-08-28 L CVFPVXG020A 26FEB2014:17:58:00 BMRK SALIENT 1315 L 4.0 103.0 32.13 36.99 4.86 1.15 21.78 138.6 21.0 J311 GOND 286000.0 368.73 11.0
4 WLOA76475R3-2011-06-21::2014-06-21 E EWKKQDP068A 02MAY2014:04:36:00 AURO SALIENT 1046 R 3.0 77.0 5.53 7.84 2.31 1.42 47.51 23.0 20.0 J311 GOND 286000.0 -999.00 64.0

Data Preparation

Data types

In [4]:
df_train.dtypes
Out[4]:
unq_whl_id           object
LOAD_EMPTY           object
trn_id               object
vnder_edr_ts_cst     object
VNDR_EDR_NME         object
EDR_VNDR             object
AGE                   int64
AXLE_SIDE            object
EQP_AXLE_NBR        float64
EQP_SEQ_NBR         float64
WHL_AVG_KIPS        float64
WHL_PEAK_KIPS       float64
WHL_DYN_KIPS        float64
WHL_DYN_RATIO       float64
EDR_EQP_SPD         float64
EQP_GRS_TONS        float64
TARE                float64
AAR_CT_C             object
EQP_GRP              object
GRS_RAIL_WGT        float64
KIPDAYS             float64
RPR_WHY_CD          float64
dtype: object
In [5]:
df_train.vnder_edr_ts_cst = pd.to_datetime(df_train.vnder_edr_ts_cst, format='%d%b%Y:%H:%M:%S')
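As a quick sanity check, the format string matches the raw timestamps shown above (e.g. 01JUN2013:03:01:00):

pd.to_datetime('01JUN2013:03:01:00', format='%d%b%Y:%H:%M:%S')
# Timestamp('2013-06-01 03:01:00')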

Data profile

In [6]:
pp.ProfileReport(df_train, check_correlation=True)
Out[6]:

Overview

Dataset info

Number of variables 22
Number of observations 7032197
Total Missing (%) 0.0%
Total size in memory 1.2 GiB
Average record size in memory 176.0 B

Variables types

Numeric 12
Categorical 8
Boolean 0
Date 1
Text (Unique) 0
Rejected 1
Unsupported 0

Warnings

  • AAR_CT_C has a high cardinality: 386 distinct values Warning
  • GRS_RAIL_WGT is highly correlated with TARE (ρ = 0.9196) Rejected
  • trn_id has a high cardinality: 115887 distinct values Warning
  • unq_whl_id has a high cardinality: 113422 distinct values Warning

Variables

AAR_CT_C
Categorical

Distinct count 386
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 1
J311
2532309
C114
963453
S635
606598
Other values (382)
2929836
Value Count Frequency (%)  
J311 2532309 36.0%
 
C114 963453 13.7%
 
S635 606598 8.6%
 
S162 569073 8.1%
 
K341 467251 6.6%
 
T108 198021 2.8%
 
K346 179110 2.5%
 
C113 125869 1.8%
 
J301 110263 1.6%
 
F483 78154 1.1%
 
Other values (375) 1202095 17.1%
 

AGE
Numeric

Distinct count 2525
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 844.29
Minimum 0
Maximum 2540
Zeros (%) 0.0%

Quantile statistics

Minimum 0
5-th percentile 209
Q1 592
Median 857
Q3 1044
95-th percentile 1586
Maximum 2540
Range 2540
Interquartile range 452

Descriptive statistics

Standard deviation 389.29
Coef of variation 0.46109
Kurtosis 0.77297
Mean 844.29
MAD 295.54
Skewness 0.47535
Sum 5937222145
Variance 151550
Memory size 53.7 MiB
Value Count Frequency (%)  
1093 17999 0.3%
 
1092 17196 0.2%
 
1094 16356 0.2%
 
1091 15909 0.2%
 
1090 15287 0.2%
 
1089 14757 0.2%
 
1088 14143 0.2%
 
1087 13651 0.2%
 
1086 13515 0.2%
 
1081 13312 0.2%
 
Other values (2515) 6880072 97.8%
 

Minimum 5 values

Value Count Frequency (%)  
0 268 0.0%
 
1 474 0.0%
 
2 938 0.0%
 
3 1171 0.0%
 
4 1217 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
2528 2 0.0%
 
2531 1 0.0%
 
2532 1 0.0%
 
2533 1 0.0%
 
2540 1 0.0%
 

AXLE_SIDE
Categorical

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
R
3517480
L
3514717
Value Count Frequency (%)  
R 3517480 50.0%
 
L 3514717 50.0%
 

EDR_EQP_SPD
Numeric

Distinct count 7478
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 1
Infinite (%) 0.0%
Infinite (n) 0
Mean 41.325
Minimum 0
Maximum 464.97
Zeros (%) 0.3%

Quantile statistics

Minimum 0
5-th percentile 22.97
Q1 33.77
Median 40.9
Q3 49.65
95-th percentile 60.73
Maximum 464.97
Range 464.97
Interquartile range 15.88

Descriptive statistics

Standard deviation 11.849
Coef of variation 0.28673
Kurtosis 0.64404
Mean 41.325
MAD 9.4909
Skewness 0.023451
Sum 290600000
Variance 140.4
Memory size 53.7 MiB
Value Count Frequency (%)  
0.0 21439 0.3%
 
37.88 16169 0.2%
 
37.17 15799 0.2%
 
39.38 14688 0.2%
 
36.49 14293 0.2%
 
37.52 13752 0.2%
 
41.43 13607 0.2%
 
44.19 13271 0.2%
 
47.92 13242 0.2%
 
46.25 13145 0.2%
 
Other values (7467) 6882791 97.9%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 21439 0.3%
 
0.72 1 0.0%
 
0.73 1 0.0%
 
0.74 1 0.0%
 
0.77 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
315.66 1 0.0%
 
320.67 1 0.0%
 
331.41 1 0.0%
 
331.52 1 0.0%
 
464.97 1 0.0%
 

EDR_VNDR
Categorical

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
PRT
3591909
SALIENT
3440288
Value Count Frequency (%)  
PRT 3591909 51.1%
 
SALIENT 3440288 48.9%
 

EQP_AXLE_NBR
Numeric

Distinct count 10
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 1
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.9314
Minimum 1
Maximum 9
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 2
Median 3
Q3 4
95-th percentile 7
Maximum 9
Range 8
Interquartile range 2

Descriptive statistics

Standard deviation 1.7084
Coef of variation 0.58278
Kurtosis 1.8404
Mean 2.9314
MAD 1.2857
Skewness 1.1971
Sum 20614000
Variance 2.9185
Memory size 53.7 MiB
Value Count Frequency (%)  
4.0 1637180 23.3%
 
1.0 1617814 23.0%
 
3.0 1568637 22.3%
 
2.0 1498626 21.3%
 
5.0 179485 2.6%
 
6.0 178403 2.5%
 
8.0 132484 1.9%
 
7.0 129519 1.8%
 
9.0 90048 1.3%
 
(Missing) 1 0.0%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 1617814 23.0%
 
2.0 1498626 21.3%
 
3.0 1568637 22.3%
 
4.0 1637180 23.3%
 
5.0 179485 2.6%
 

Maximum 5 values

Value Count Frequency (%)  
5.0 179485 2.6%
 
6.0 178403 2.5%
 
7.0 129519 1.8%
 
8.0 132484 1.9%
 
9.0 90048 1.3%
 

EQP_GRP
Categorical

Distinct count 10
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 1
GOND
2748507
HOPP
1957786
IFLT
1506983
Other values (6)
818920
Value Count Frequency (%)  
GOND 2748507 39.1%
 
HOPP 1957786 27.8%
 
IFLT 1506983 21.4%
 
TANK 388373 5.5%
 
BOXC 216693 3.1%
 
FLAT 129522 1.8%
 
VFLT 73356 1.0%
 
MISC 7770 0.1%
 
MFLT 3206 0.0%
 
(Missing) 1 0.0%
 

EQP_GRS_TONS
Numeric

Distinct count 4767
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 1
Infinite (%) 0.0%
Infinite (n) 0
Mean 113.05
Minimum 0
Maximum 793
Zeros (%) 0.0%

Quantile statistics

Minimum 0
5-th percentile 21.4
Q1 31.3
Median 128.8
Q3 146
95-th percentile 287.4
Maximum 793
Range 793
Interquartile range 114.7

Descriptive statistics

Standard deviation 83.947
Coef of variation 0.74257
Kurtosis 1.3165
Mean 113.05
MAD 66.054
Skewness 1.0337
Sum 794980000
Variance 7047
Memory size 53.7 MiB
Value Count Frequency (%)  
21.5 37986 0.5%
 
21.3 37764 0.5%
 
21.4 37500 0.5%
 
21.6 37265 0.5%
 
21.2 36536 0.5%
 
21.7 36319 0.5%
 
21.8 35037 0.5%
 
21.1 34143 0.5%
 
21.9 32990 0.5%
 
22.0 32327 0.5%
 
Other values (4756) 6674329 94.9%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 15 0.0%
 
0.5 1 0.0%
 
0.6 1 0.0%
 
1.2 1 0.0%
 
1.5 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
728.0 1 0.0%
 
734.2 1 0.0%
 
767.8 1 0.0%
 
792.5 1 0.0%
 
793.0 1 0.0%
 

EQP_SEQ_NBR
Numeric

Distinct count 172
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 1
Infinite (%) 0.0%
Infinite (n) 0
Mean 51.689
Minimum 1
Maximum 178
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 7
Q1 21
Median 44
Q3 80
95-th percentile 115
Maximum 178
Range 177
Interquartile range 59

Descriptive statistics

Standard deviation 35.431
Coef of variation 0.68547
Kurtosis -0.88678
Mean 51.689
MAD 30.507
Skewness 0.50003
Sum 363490000
Variance 1255.4
Memory size 53.7 MiB
Value Count Frequency (%)  
7.0 101253 1.4%
 
6.0 101155 1.4%
 
11.0 101096 1.4%
 
9.0 100495 1.4%
 
10.0 100329 1.4%
 
5.0 100276 1.4%
 
12.0 100079 1.4%
 
8.0 99146 1.4%
 
14.0 98684 1.4%
 
13.0 98612 1.4%
 
Other values (161) 6031071 85.8%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 26 0.0%
 
2.0 2976 0.0%
 
3.0 54404 0.8%
 
4.0 83419 1.2%
 
5.0 100276 1.4%
 

Maximum 5 values

Value Count Frequency (%)  
167.0 6 0.0%
 
168.0 2 0.0%
 
170.0 2 0.0%
 
174.0 1 0.0%
 
178.0 2 0.0%
 

GRS_RAIL_WGT
Highly correlated

This variable is highly correlated with TARE and should be ignored for analysis

Correlation 0.9196

KIPDAYS
Numeric

Distinct count 854015
Unique (%) 12.1%
Missing (%) 0.0%
Missing (n) 1
Infinite (%) 0.0%
Infinite (n) 0
Mean 421.63
Minimum -999
Maximum 50548
Zeros (%) 0.0%

Quantile statistics

Minimum -999
5-th percentile -999
Q1 -999
Median -999
Q3 493.46
95-th percentile 6738.7
Maximum 50548
Range 51547
Interquartile range 1492.5

Descriptive statistics

Standard deviation 3211.6
Coef of variation 7.6171
Kurtosis 20.424
Mean 421.63
MAD 2047.2
Skewness 3.7285
Sum 2965000000
Variance 10314000
Memory size 53.7 MiB
Value Count Frequency (%)  
-999.0 5035311 71.6%
 
65.54 35 0.0%
 
65.76 33 0.0%
 
65.18 32 0.0%
 
66.76 31 0.0%
 
66.12 30 0.0%
 
65.2 30 0.0%
 
66.7 30 0.0%
 
130.36 30 0.0%
 
67.98 29 0.0%
 
Other values (854004) 1996605 28.4%
 

Minimum 5 values

Value Count Frequency (%)  
-999.0 5035311 71.6%
 
1.35 1 0.0%
 
1.84 1 0.0%
 
2.04 1 0.0%
 
2.83 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
50152.44 1 0.0%
 
50251.36 1 0.0%
 
50350.28 1 0.0%
 
50449.2 1 0.0%
 
50548.12 1 0.0%
 

LOAD_EMPTY
Categorical

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
L
4095616
E
2936581
Value Count Frequency (%)  
L 4095616 58.2%
 
E 2936581 41.8%
 

RPR_WHY_CD
Numeric

Distinct count 33
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 1
Infinite (%) 0.0%
Infinite (n) 0
Mean 32.25
Minimum 3
Maximum 89
Zeros (%) 0.0%

Quantile statistics

Minimum 3
5-th percentile 11
Q1 11
Median 11
Q3 64
95-th percentile 75
Maximum 89
Range 86
Interquartile range 53

Descriptive statistics

Standard deviation 26.551
Coef of variation 0.82329
Kurtosis -1.6809
Mean 32.25
MAD 25.633
Skewness 0.49314
Sum 226790000
Variance 704.97
Memory size 53.7 MiB
Value Count Frequency (%)  
11.0 4121421 58.6%
 
65.0 1170951 16.7%
 
60.0 444095 6.3%
 
64.0 386611 5.5%
 
75.0 318511 4.5%
 
61.0 228594 3.3%
 
25.0 122394 1.7%
 
73.0 58038 0.8%
 
78.0 54055 0.8%
 
7.0 52809 0.8%
 
Other values (22) 74717 1.1%
 

Minimum 5 values

Value Count Frequency (%)  
3.0 3380 0.0%
 
7.0 52809 0.8%
 
8.0 761 0.0%
 
11.0 4121421 58.6%
 
13.0 10608 0.2%
 

Maximum 5 values

Value Count Frequency (%)  
81.0 16 0.0%
 
83.0 345 0.0%
 
84.0 11 0.0%
 
85.0 269 0.0%
 
89.0 19 0.0%
 

TARE
Numeric

Distinct count 102
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 1
Infinite (%) 0.0%
Infinite (n) 0
Mean 37.008
Minimum 16
Maximum 241
Zeros (%) 0.0%

Quantile statistics

Minimum 16
5-th percentile 20
Q1 21
Median 29
Q3 37
95-th percentile 89
Maximum 241
Range 225
Interquartile range 16

Descriptive statistics

Standard deviation 23.106
Coef of variation 0.62435
Kurtosis 0.85242
Mean 37.008
MAD 18.024
Skewness 1.4847
Sum 260250000
Variance 533.89
Memory size 53.7 MiB
Value Count Frequency (%)  
21.0 1278786 18.2%
 
20.0 1006582 14.3%
 
31.0 624189 8.9%
 
30.0 541697 7.7%
 
24.0 527031 7.5%
 
22.0 280973 4.0%
 
88.0 208298 3.0%
 
89.0 202457 2.9%
 
32.0 178226 2.5%
 
25.0 143593 2.0%
 
Other values (91) 2040364 29.0%
 

Minimum 5 values

Value Count Frequency (%)  
16.0 120 0.0%
 
20.0 1006582 14.3%
 
21.0 1278786 18.2%
 
22.0 280973 4.0%
 
23.0 116546 1.7%
 

Maximum 5 values

Value Count Frequency (%)  
160.0 1 0.0%
 
183.0 20 0.0%
 
189.0 3 0.0%
 
206.0 8 0.0%
 
241.0 1 0.0%
 

VNDR_EDR_NME
Categorical

Distinct count 23
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
AURO
 
693626
AMAZ
 
573707
VELO
 
549669
Other values (20)
5215195
Value Count Frequency (%)  
AURO 693626 9.9%
 
AMAZ 573707 8.2%
 
VELO 549669 7.8%
 
ROGN 446753 6.4%
 
JOPP 429930 6.1%
 
BAGD 414028 5.9%
 
BMRK 392370 5.6%
 
GAGE 360760 5.1%
 
COCH 329820 4.7%
 
CHRS 318541 4.5%
 
Other values (13) 2522993 35.9%
 

WHL_AVG_KIPS
Numeric

Distinct count 6928
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 1
Infinite (%) 0.0%
Infinite (n) 0
Mean 21.85
Minimum 0.01
Maximum 106.95
Zeros (%) 0.0%

Quantile statistics

Minimum 0.01
5-th percentile 5.15
Q1 7.62
Median 23.41
Q3 34.63
95-th percentile 39.74
Maximum 106.95
Range 106.94
Interquartile range 27.01

Descriptive statistics

Standard deviation 13.319
Coef of variation 0.60957
Kurtosis -1.5672
Mean 21.85
MAD 12.394
Skewness 0.035537
Sum 153650000
Variance 177.39
Memory size 53.7 MiB
Value Count Frequency (%)  
5.51 8443 0.1%
 
5.53 8293 0.1%
 
5.55 8270 0.1%
 
5.57 8212 0.1%
 
5.59 8167 0.1%
 
5.5 8138 0.1%
 
5.45 8039 0.1%
 
5.54 8038 0.1%
 
5.44 8024 0.1%
 
5.48 8024 0.1%
 
Other values (6917) 6950548 98.8%
 

Minimum 5 values

Value Count Frequency (%)  
0.01 1 0.0%
 
0.12 2 0.0%
 
0.14 1 0.0%
 
0.15 1 0.0%
 
0.16 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
96.12 1 0.0%
 
96.25 1 0.0%
 
99.78 1 0.0%
 
104.89 1 0.0%
 
106.95 1 0.0%
 

WHL_DYN_KIPS
Numeric

Distinct count 9610
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 1
Infinite (%) 0.0%
Infinite (n) 0
Mean 7.457
Minimum -41.3
Maximum 192.11
Zeros (%) 0.2%

Quantile statistics

Minimum -41.3
5-th percentile 0.89
Q1 2.28
Median 4.42
Q3 8.39
95-th percentile 26.78
Maximum 192.11
Range 233.41
Interquartile range 6.11

Descriptive statistics

Standard deviation 8.8316
Coef of variation 1.1843
Kurtosis 9.2246
Mean 7.457
MAD 5.8559
Skewness 2.6941
Sum 52439000
Variance 77.997
Memory size 53.7 MiB
Value Count Frequency (%)  
0.0 12494 0.2%
 
1.75 10931 0.2%
 
1.79 10886 0.2%
 
1.67 10770 0.2%
 
1.5 10752 0.2%
 
1.46 10740 0.2%
 
1.69 10735 0.2%
 
1.84 10724 0.2%
 
1.64 10716 0.2%
 
1.73 10711 0.2%
 
Other values (9599) 6922737 98.4%
 

Minimum 5 values

Value Count Frequency (%)  
-41.3 1 0.0%
 
-40.51 1 0.0%
 
-24.25 1 0.0%
 
-20.48 1 0.0%
 
-19.68 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
150.19 1 0.0%
 
155.79 1 0.0%
 
158.86 1 0.0%
 
164.01 1 0.0%
 
192.11 1 0.0%
 

WHL_DYN_RATIO
Numeric

Distinct count 1163
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 1
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.4464
Minimum 0
Maximum 138.6
Zeros (%) 0.0%

Quantile statistics

Minimum 0
5-th percentile 1.07
Q1 1.14
Median 1.24
Q3 1.47
95-th percentile 2.58
Maximum 138.6
Range 138.6
Interquartile range 0.33

Descriptive statistics

Standard deviation 0.61704
Coef of variation 0.42661
Kurtosis 538.03
Mean 1.4464
MAD 0.36629
Skewness 6.7414
Sum 10171000
Variance 0.38074
Memory size 53.7 MiB
Value Count Frequency (%)  
1.14 221224 3.1%
 
1.15 216862 3.1%
 
1.13 215972 3.1%
 
1.16 211383 3.0%
 
1.12 203414 2.9%
 
1.17 202734 2.9%
 
1.11 199826 2.8%
 
1.18 193173 2.7%
 
1.1 192162 2.7%
 
1.19 182990 2.6%
 
Other values (1152) 4992456 71.0%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 33 0.0%
 
0.04 1 0.0%
 
0.11 1 0.0%
 
0.13 1 0.0%
 
0.24 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
61.6 1 0.0%
 
61.62 1 0.0%
 
87.74 1 0.0%
 
94.9 1 0.0%
 
138.6 1 0.0%
 

WHL_PEAK_KIPS
Numeric

Distinct count 11758
Unique (%) 0.2%
Missing (%) 0.0%
Missing (n) 1
Infinite (%) 0.0%
Infinite (n) 0
Mean 29.307
Minimum 0
Maximum 227.07
Zeros (%) 0.0%

Quantile statistics

Minimum 0
5-th percentile 6.76
Q1 11.33
Median 30.61
Q3 41.63
95-th percentile 59.74
Maximum 227.07
Range 227.07
Interquartile range 30.3

Descriptive statistics

Standard deviation 17.902
Coef of variation 0.61086
Kurtosis 0.14581
Mean 29.307
MAD 15.196
Skewness 0.56538
Sum 206090000
Variance 320.49
Memory size 53.7 MiB
Value Count Frequency (%)  
7.5 5716 0.1%
 
7.25 5598 0.1%
 
7.0 5543 0.1%
 
7.75 5531 0.1%
 
7.17 5357 0.1%
 
7.38 5345 0.1%
 
7.42 5323 0.1%
 
7.3 5274 0.1%
 
7.32 5264 0.1%
 
7.4 5256 0.1%
 
Other values (11747) 6977989 99.2%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 33 0.0%
 
0.01 1 0.0%
 
0.06 1 0.0%
 
0.07 1 0.0%
 
0.13 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
184.27 1 0.0%
 
185.78 1 0.0%
 
196.37 1 0.0%
 
205.36 1 0.0%
 
227.07 1 0.0%
 

trn_id
Categorical

Distinct count 115887
Unique (%) 1.6%
Missing (%) 0.0%
Missing (n) 0
SOSFVFR214K
 
1591
SOSFVFR218K
 
1461
SOSFVFR226K
 
1354
Other values (115884)
7027791
Value Count Frequency (%)  
SOSFVFR214K 1591 0.0%
 
SOSFVFR218K 1461 0.0%
 
SOSFVFR226K 1354 0.0%
 
SOSFVFR220K 1325 0.0%
 
SOSFVFR228K 1311 0.0%
 
SOSFVFR217K 1310 0.0%
 
SOSFVFR223K 1298 0.0%
 
SOSFVFR113K 1297 0.0%
 
SOSFVFR225K 1281 0.0%
 
SOSFVFR227K 1237 0.0%
 
Other values (115877) 7018732 99.8%
 

unq_whl_id
Categorical

Distinct count 113422
Unique (%) 1.6%
Missing (%) 0.0%
Missing (n) 0
EQVI586823L5-2010-05-20::2014-12-08
 
463
IHF04793R2-2011-08-07::2014-08-07
 
452
UDZA522R4-2011-05-26::2014-05-26
 
451
Other values (113419)
7030831
Value Count Frequency (%)  
EQVI586823L5-2010-05-20::2014-12-08 463 0.0%
 
IHF04793R2-2011-08-07::2014-08-07 452 0.0%
 
UDZA522R4-2011-05-26::2014-05-26 451 0.0%
 
UDZA561R2-2011-05-27::2014-05-27 451 0.0%
 
UDZA522L2-2011-05-26::2014-05-26 451 0.0%
 
WWHA686285L3-2011-08-29::2014-08-29 449 0.0%
 
UDZA583R3-2011-05-26::2014-05-26 433 0.0%
 
UDZA583L4-2011-05-26::2014-05-26 433 0.0%
 
XWOA545178R1-2011-11-23::2014-11-23 425 0.0%
 
UDZA560R2-2011-05-26::2014-05-26 425 0.0%
 
Other values (113412) 7027764 99.9%
 

vnder_edr_ts_cst
Date

Distinct count 451346
Unique (%) 6.4%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Minimum 2012-07-01 00:01:00
Maximum 2014-12-31 15:23:00

Correlations

Sample

unq_whl_id LOAD_EMPTY trn_id vnder_edr_ts_cst VNDR_EDR_NME EDR_VNDR AGE AXLE_SIDE EQP_AXLE_NBR EQP_SEQ_NBR WHL_AVG_KIPS WHL_PEAK_KIPS WHL_DYN_KIPS WHL_DYN_RATIO EDR_EQP_SPD EQP_GRS_TONS TARE AAR_CT_C EQP_GRP GRS_RAIL_WGT KIPDAYS RPR_WHY_CD
0 EQVI503147L7-2011-05-28::2014-03-08 L SOSFVFR229K 2013-06-01 03:01:00 BAGD SALIENT 735 L 7.0 10.0 36.35 40.24 3.89 1.11 41.57 345.6 94.0 S162 IFLT 800000.0 -999.00 11.0
1 EQVI717414R1-2013-05-15::2013-09-17 E XSJDVPQ914G 2013-08-15 04:55:00 COCH SALIENT 92 R 1.0 73.0 7.23 10.81 3.58 1.50 56.95 30.4 30.0 C114 HOPP 286000.0 -999.00 11.0
2 EQVI587556R3-2013-05-13::2014-08-12 L QDWJODF329A 2013-12-03 05:48:00 BAGD SALIENT 204 R 3.0 4.0 32.47 46.97 14.50 1.45 45.79 194.2 65.0 S635 IFLT 487000.0 117.58 65.0
3 GHWA626L4-2010-07-22::2014-08-28 L CVFPVXG020A 2014-02-26 17:58:00 BMRK SALIENT 1315 L 4.0 103.0 32.13 36.99 4.86 1.15 21.78 138.6 21.0 J311 GOND 286000.0 368.73 11.0
4 WLOA76475R3-2011-06-21::2014-06-21 E EWKKQDP068A 2014-05-02 04:36:00 AURO SALIENT 1046 R 3.0 77.0 5.53 7.84 2.31 1.42 47.51 23.0 20.0 J311 GOND 286000.0 -999.00 64.0
In [7]:
df_train.sort_values(by=['unq_whl_id','vnder_edr_ts_cst'], inplace=True)
In [8]:
# Number of wheels
print('Total number of wheels in the data set:', df_train.unq_whl_id.nunique())
Total number of wheels in the data set: 113422
In [9]:
df_train['IS_CRITICAL'] = df_train.WHL_PEAK_KIPS >= 90
In [10]:
df_plot = df_train.groupby('unq_whl_id').IS_CRITICAL.sum().value_counts()

data = [go.Bar(
            x=df_plot.index,
            y=df_plot
    )]
layout = go.Layout(title='Number of critical peaks per wheel',
                  xaxis=dict(title='Number of critical peaks'),
                  yaxis=dict(title='Number of wheels'))

fig = go.Figure(data=data, layout=layout)
iplot(fig,filename='basic-bar')

Research questions:

  1. Is a wheel more likely to have a second peak after its first?
  2. Is the time between two subsequent peaks shorter than the time to the first peak? (A quick sketch for exploring this follows.)
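A quick way to start on question 2 (a sketch, not part of the original analysis) is to look at the gaps between consecutive critical readings of the same wheel, using the IS_CRITICAL flag created above:

peaks = df_train[df_train.IS_CRITICAL]
gaps = (peaks.groupby('unq_whl_id').vnder_edr_ts_cst
             .diff()        # time since the previous peak of the same wheel
             .dt.days
             .dropna())
print(gaps.describe())      # distribution of days between subsequent peaks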

Problem reformulation:

Predict the peak kips value of an empty car at the next loaded status within the next 10 days. The goal is to predict the first peak kip for each wheel.

Feature engineering

  • Rollup window features, average and maximum:

    • peak kips per wheel
    • dynamic kips
    • tons
    • ...
  • Next state features to create a target

    • Next state loaded or empty
    • Time to next state (days)
    • Is peak in next state an alert (above 90)
  • Split off wheels with 0 peaks and take a random empty row with a valid next loaded peak variable

  • For wheels with more than 0 peaks, select the first empty row whose next loaded peak is critical
In [11]:
# Add feature per wheel, did it have a peak or not?
df_train = pd.merge(df_train,
                    pd.DataFrame(df_train.groupby('unq_whl_id').IS_CRITICAL.max().rename("had_peak")).reset_index(), 
                    on="unq_whl_id")
In [12]:
# Next state of the wagon - loaded or empty, peak or not peak, time distance to next state
df_train[["next_ts","next_LOAD_EMPTY","next_CRITICAL"]] = df_train.groupby("unq_whl_id")[["vnder_edr_ts_cst","LOAD_EMPTY","IS_CRITICAL"]].shift(-1)
In [13]:
# Rollup window features over last 10 days: 
def roll(g):
    g.set_index("vnder_edr_ts_cst",inplace=True)
    return g[["WHL_AVG_KIPS",
                     "WHL_PEAK_KIPS",
                     "WHL_DYN_KIPS",
                     "WHL_DYN_RATIO",
                     "EDR_EQP_SPD",
                     "EQP_GRS_TONS"]].rolling("10D").agg(["mean","max"])


df_rolled = df_train.groupby("unq_whl_id").apply(roll)
df_rolled.to_pickle("df_rolled_10d.pkl")
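Note that groupby().apply with a Python function is slow on ~7 million rows. A possibly faster equivalent (a sketch, untested against the output above, assuming a pandas version in which GroupBy.rolling accepts on= for time-based windows):

cols = ["WHL_AVG_KIPS", "WHL_PEAK_KIPS", "WHL_DYN_KIPS",
        "WHL_DYN_RATIO", "EDR_EQP_SPD", "EQP_GRS_TONS"]
df_rolled_alt = (df_train[["unq_whl_id", "vnder_edr_ts_cst"] + cols]
                 .groupby("unq_whl_id")
                 .rolling("10D", on="vnder_edr_ts_cst")
                 .agg(["mean", "max"]))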
In [14]:
df_rolled = pd.read_pickle('df_rolled_10d.pkl')
df_rolled.head()
Out[14]:
WHL_AVG_KIPS WHL_PEAK_KIPS WHL_DYN_KIPS WHL_DYN_RATIO EDR_EQP_SPD EQP_GRS_TONS
mean max mean max mean max mean max mean max mean max
unq_whl_id vnder_edr_ts_cst
AFOA23300L2-2011-02-09::2014-02-09 2012-09-28 05:36:00 6.610 6.61 9.060000 9.06 2.450000 2.45 1.370000 1.37 37.880 37.88 26.70 26.7
2012-10-06 02:11:00 6.815 7.02 9.125000 9.19 2.310000 2.45 1.340000 1.37 30.850 37.88 25.75 26.7
2012-12-03 06:30:00 6.440 6.44 10.600000 10.60 4.160000 4.16 1.650000 1.65 29.310 29.31 25.70 25.7
2012-12-05 18:23:00 20.930 35.42 25.835000 41.07 4.905000 5.65 1.405000 1.65 28.125 29.31 85.50 145.3
2012-12-07 06:34:00 16.230 35.42 20.343333 41.07 4.113333 5.65 1.393333 1.65 30.960 36.63 65.70 145.3
In [15]:
df_train = pd.merge(df_train.set_index(["unq_whl_id","vnder_edr_ts_cst"]),df_rolled,left_index=True, right_index=True)
In [16]:
df_train.reset_index(inplace=True)

Create data set

We have features and a target; can we start modelling?

In [17]:
df_train.head()
Out[17]:
unq_whl_id vnder_edr_ts_cst LOAD_EMPTY trn_id VNDR_EDR_NME EDR_VNDR AGE AXLE_SIDE EQP_AXLE_NBR EQP_SEQ_NBR WHL_AVG_KIPS WHL_PEAK_KIPS WHL_DYN_KIPS WHL_DYN_RATIO EDR_EQP_SPD EQP_GRS_TONS TARE AAR_CT_C EQP_GRP GRS_RAIL_WGT KIPDAYS RPR_WHY_CD IS_CRITICAL had_peak next_ts next_LOAD_EMPTY next_CRITICAL (WHL_AVG_KIPS, mean) (WHL_AVG_KIPS, max) (WHL_PEAK_KIPS, mean) (WHL_PEAK_KIPS, max) (WHL_DYN_KIPS, mean) (WHL_DYN_KIPS, max) (WHL_DYN_RATIO, mean) (WHL_DYN_RATIO, max) (EDR_EQP_SPD, mean) (EDR_EQP_SPD, max) (EQP_GRS_TONS, mean) (EQP_GRS_TONS, max)
0 AFOA23300L2-2011-02-09::2014-02-09 2012-09-28 05:36:00 E EPLFFGP012A ROGN SALIENT 597 L 2.0 3.0 6.61 9.06 2.45 1.37 37.88 26.7 25.0 K341 HOPP 286000.0 1747.94 11.0 False False 2012-10-06 02:11:00 E False 6.610 6.61 9.060000 9.06 2.450000 2.45 1.370000 1.37 37.880 37.88 26.70 26.7
1 AFOA23300L2-2011-02-09::2014-02-09 2012-10-06 02:11:00 E HOLQGHQ904A ROGN SALIENT 605 L 2.0 14.0 7.02 9.19 2.17 1.31 23.82 24.8 25.0 K341 HOPP 286000.0 1818.06 11.0 False False 2012-12-03 06:30:00 E False 6.815 7.02 9.125000 9.19 2.310000 2.45 1.340000 1.37 30.850 37.88 25.75 26.7
2 AFOA23300L2-2011-02-09::2014-02-09 2012-12-03 06:30:00 E EPLFEWP030A ROGN SALIENT 663 L 2.0 27.0 6.44 10.60 4.16 1.65 29.31 25.7 25.0 K341 HOPP 286000.0 1888.18 11.0 False False 2012-12-05 18:23:00 L False 6.440 6.44 10.600000 10.60 4.160000 4.16 1.650000 1.65 29.310 29.31 25.70 25.7
3 AFOA23300L2-2011-02-09::2014-02-09 2012-12-05 18:23:00 L CEDPPLF180A ROGN SALIENT 665 L 2.0 30.0 35.42 41.07 5.65 1.16 26.94 145.3 25.0 K341 HOPP 286000.0 1958.30 11.0 False False 2012-12-07 06:34:00 E False 20.930 35.42 25.835000 41.07 4.905000 5.65 1.405000 1.65 28.125 29.31 85.50 145.3
4 AFOA23300L2-2011-02-09::2014-02-09 2012-12-07 06:34:00 E EPLFEDP180A ROGN SALIENT 667 L 2.0 30.0 6.83 9.36 2.53 1.37 36.63 26.1 25.0 K341 HOPP 286000.0 2028.42 11.0 False False 2012-12-09 04:25:00 L False 16.230 35.42 20.343333 41.07 4.113333 5.65 1.393333 1.65 30.960 36.63 65.70 145.3
In [18]:
Image(url= "figures/dataset.png", width=700, height=700)
Out[18]:

Wheels without a peak kip

  • Select all wheels that:

    • never had a kip
  • For each wheel that never had a kip, select only states in which:

    • the car is empty
    • the next state is loaded
    • the next loaded state happens within 10 days
  • Then randomly select one such state
In [19]:
df_no = df_train[(df_train.had_peak==False) &  # never had a peak
         (df_train.LOAD_EMPTY == "E") & # current state empty
         (df_train.next_LOAD_EMPTY == "L") & # next state loaded
         ((df_train.next_ts - df_train.vnder_edr_ts_cst).apply(lambda x: x.days<10))] # next load occurs within 10 days

df_no = df_no.groupby("unq_whl_id").apply(lambda g: g.sample(1))
df_no.drop("unq_whl_id", axis=1, inplace=True)

Wheels that had at least one peak kip

  • Select all wheels that:

    • had at least one peak kip
  • For each wheel that had at least one kip, select only states in which:

    • the car is empty
    • the next state is loaded
    • the next loaded state happens within 10 days
  • Then select the first state whose next state has a kip (positive target)
In [20]:
df_yes = df_train[(df_train.had_peak==True) &  # had a peak
         (df_train.LOAD_EMPTY == "E") & # current state empty
         (df_train.next_LOAD_EMPTY == "L") & # next state loaded
         ((df_train.next_ts - df_train.vnder_edr_ts_cst).apply(lambda x: x.days<10))] # next load occurs within 10 days

df_yes = df_yes[df_yes.next_CRITICAL==True].groupby("unq_whl_id").first()
In [21]:
df = pd.concat([df_no, df_yes])
In [22]:
df_plot =  df.next_CRITICAL.value_counts()

data = [go.Bar(
            x=df_plot.index,
            y=df_plot
    )]
layout = go.Layout(title='Distribution of target',
                  xaxis=dict(title='Next peak critical'),
                  yaxis=dict(title='Count'),
                  width=600,
                  height=400)

fig = go.Figure(data=data, layout=layout)
iplot(fig,filename='basic-bar')
In [23]:
df.to_pickle('dataset_for_FS.pkl')

Initial feature selection

In [24]:
df = pd.read_pickle('dataset_for_FS.pkl')
In [25]:
df.head()
Out[25]:
vnder_edr_ts_cst LOAD_EMPTY trn_id VNDR_EDR_NME EDR_VNDR AGE AXLE_SIDE EQP_AXLE_NBR EQP_SEQ_NBR WHL_AVG_KIPS WHL_PEAK_KIPS WHL_DYN_KIPS WHL_DYN_RATIO EDR_EQP_SPD EQP_GRS_TONS TARE AAR_CT_C EQP_GRP GRS_RAIL_WGT KIPDAYS RPR_WHY_CD IS_CRITICAL had_peak next_ts next_LOAD_EMPTY next_CRITICAL (WHL_AVG_KIPS, mean) (WHL_AVG_KIPS, max) (WHL_PEAK_KIPS, mean) (WHL_PEAK_KIPS, max) (WHL_DYN_KIPS, mean) (WHL_DYN_KIPS, max) (WHL_DYN_RATIO, mean) (WHL_DYN_RATIO, max) (EDR_EQP_SPD, mean) (EDR_EQP_SPD, max) (EQP_GRS_TONS, mean) (EQP_GRS_TONS, max)
(AFOA23300L2-2011-02-09::2014-02-09, 42) 2013-02-21 19:35:00 E EPLFEDP030A ROGN SALIENT 743 L 2.0 87.0 6.11 8.91 2.80 1.46 48.31 26.2 25.0 K341 HOPP 286000.0 4692.98 11.0 False False 2013-02-23 23:43:00 L False 17.832000 38.42 23.446000 44.88 5.614000 9.59 1.458000 1.87 44.308000 55.67 74.760000 145.1
(AFOA23301L1-2009-09-12::2012-09-12, 147) 2012-08-24 20:02:00 E EPLFEWP022A ROGN SALIENT 1077 L 1.0 16.0 6.08 8.56 2.48 1.41 56.62 26.3 25.0 K341 HOPP 286000.0 -999.00 11.0 False False 2012-08-27 04:56:00 L False 23.548333 41.85 26.635000 46.10 3.086667 4.25 1.216667 1.41 43.356667 56.62 86.016667 145.3
(AFOA23308L1-2011-02-07::2014-02-07, 239) 2013-01-23 19:34:00 E EPLFDWP010A ROGN SALIENT 716 L 1.0 73.0 6.51 6.99 0.48 1.07 20.71 26.1 25.0 K341 HOPP 286000.0 -999.00 11.0 False False 2013-01-26 02:06:00 L False 24.342000 36.97 27.088000 41.70 2.746000 4.73 1.126000 1.23 31.230000 39.50 96.440000 145.0
(AFOA23309L4-2011-02-13::2014-02-13, 450) 2013-06-21 09:44:00 E EPLFEDP097A ROGN SALIENT 859 L 4.0 4.0 7.16 8.25 1.09 1.15 29.08 26.4 25.0 K341 HOPP 286000.0 -999.00 60.0 False False 2013-06-24 05:33:00 L False 7.160000 7.16 8.250000 8.25 1.090000 1.09 1.150000 1.15 29.080000 29.08 26.400000 26.4
(AFOA23313L2-2011-07-02::2014-07-02, 566) 2012-12-08 19:18:00 E EPLFDWP088A ROGN SALIENT 525 L 2.0 57.0 6.71 7.87 1.16 1.17 36.02 26.0 25.0 K341 HOPP 286000.0 -999.00 11.0 False False 2012-12-11 08:27:00 L False 21.556667 37.97 23.621667 40.77 2.065000 3.74 1.121667 1.18 36.666667 39.53 85.533333 145.8
In [26]:
features = ['VNDR_EDR_NME',
            'EDR_VNDR',
            'AGE',
            'AXLE_SIDE',
            'EQP_AXLE_NBR',
            'EQP_SEQ_NBR',
            'WHL_AVG_KIPS',
            'WHL_PEAK_KIPS',
            'WHL_DYN_KIPS',
            'WHL_DYN_RATIO',
            'EDR_EQP_SPD',
            'EQP_GRS_TONS',
            'TARE',
            'EQP_GRP',
            'GRS_RAIL_WGT',
            'KIPDAYS',            
            ('WHL_AVG_KIPS', 'mean'),           
            ('WHL_AVG_KIPS', 'max'),
            ('WHL_PEAK_KIPS', 'mean'),
            ('WHL_PEAK_KIPS', 'max'),
            ('WHL_DYN_KIPS', 'mean'),
            ('WHL_DYN_KIPS', 'max'),
            ('WHL_DYN_RATIO', 'mean'),
            ('WHL_DYN_RATIO', 'max'),
            ('EDR_EQP_SPD', 'mean'),
            ('EDR_EQP_SPD', 'max'),
            ('EQP_GRS_TONS', 'mean'),
            ('EQP_GRS_TONS', 'max')]

target = 'next_CRITICAL'

Encoding categorical variables

  • One hot encoding
In [27]:
df[features].head()
Out[27]:
VNDR_EDR_NME EDR_VNDR AGE AXLE_SIDE EQP_AXLE_NBR EQP_SEQ_NBR WHL_AVG_KIPS WHL_PEAK_KIPS WHL_DYN_KIPS WHL_DYN_RATIO EDR_EQP_SPD EQP_GRS_TONS TARE EQP_GRP GRS_RAIL_WGT KIPDAYS (WHL_AVG_KIPS, mean) (WHL_AVG_KIPS, max) (WHL_PEAK_KIPS, mean) (WHL_PEAK_KIPS, max) (WHL_DYN_KIPS, mean) (WHL_DYN_KIPS, max) (WHL_DYN_RATIO, mean) (WHL_DYN_RATIO, max) (EDR_EQP_SPD, mean) (EDR_EQP_SPD, max) (EQP_GRS_TONS, mean) (EQP_GRS_TONS, max)
(AFOA23300L2-2011-02-09::2014-02-09, 42) ROGN SALIENT 743 L 2.0 87.0 6.11 8.91 2.80 1.46 48.31 26.2 25.0 HOPP 286000.0 4692.98 17.832000 38.42 23.446000 44.88 5.614000 9.59 1.458000 1.87 44.308000 55.67 74.760000 145.1
(AFOA23301L1-2009-09-12::2012-09-12, 147) ROGN SALIENT 1077 L 1.0 16.0 6.08 8.56 2.48 1.41 56.62 26.3 25.0 HOPP 286000.0 -999.00 23.548333 41.85 26.635000 46.10 3.086667 4.25 1.216667 1.41 43.356667 56.62 86.016667 145.3
(AFOA23308L1-2011-02-07::2014-02-07, 239) ROGN SALIENT 716 L 1.0 73.0 6.51 6.99 0.48 1.07 20.71 26.1 25.0 HOPP 286000.0 -999.00 24.342000 36.97 27.088000 41.70 2.746000 4.73 1.126000 1.23 31.230000 39.50 96.440000 145.0
(AFOA23309L4-2011-02-13::2014-02-13, 450) ROGN SALIENT 859 L 4.0 4.0 7.16 8.25 1.09 1.15 29.08 26.4 25.0 HOPP 286000.0 -999.00 7.160000 7.16 8.250000 8.25 1.090000 1.09 1.150000 1.15 29.080000 29.08 26.400000 26.4
(AFOA23313L2-2011-07-02::2014-07-02, 566) ROGN SALIENT 525 L 2.0 57.0 6.71 7.87 1.16 1.17 36.02 26.0 25.0 HOPP 286000.0 -999.00 21.556667 37.97 23.621667 40.77 2.065000 3.74 1.121667 1.18 36.666667 39.53 85.533333 145.8
In [28]:
df['EQP_AXLE_NBR'] = df['EQP_AXLE_NBR'].astype(int)

categorical_features = ['VNDR_EDR_NME',
                       'EDR_VNDR',
                       'AXLE_SIDE',
                       'EQP_AXLE_NBR',
                       'EQP_GRP']
In [29]:
X = pd.get_dummies(data=df[features],columns=categorical_features)
y = df[target].astype(bool)
In [30]:
X.head()
Out[30]:
AGE EQP_SEQ_NBR WHL_AVG_KIPS WHL_PEAK_KIPS WHL_DYN_KIPS WHL_DYN_RATIO EDR_EQP_SPD EQP_GRS_TONS TARE GRS_RAIL_WGT KIPDAYS (WHL_AVG_KIPS, mean) (WHL_AVG_KIPS, max) (WHL_PEAK_KIPS, mean) (WHL_PEAK_KIPS, max) (WHL_DYN_KIPS, mean) (WHL_DYN_KIPS, max) (WHL_DYN_RATIO, mean) (WHL_DYN_RATIO, max) (EDR_EQP_SPD, mean) (EDR_EQP_SPD, max) (EQP_GRS_TONS, mean) (EQP_GRS_TONS, max) VNDR_EDR_NME_AMAZ VNDR_EDR_NME_AURO VNDR_EDR_NME_BAGD VNDR_EDR_NME_BMRK VNDR_EDR_NME_CHEL VNDR_EDR_NME_CHRS VNDR_EDR_NME_COCH VNDR_EDR_NME_DEAN VNDR_EDR_NME_ESTR VNDR_EDR_NME_GAGE VNDR_EDR_NME_JOPP VNDR_EDR_NME_KIRK VNDR_EDR_NME_KYRO VNDR_EDR_NME_LEED VNDR_EDR_NME_MALT VNDR_EDR_NME_MESA VNDR_EDR_NME_PEST VNDR_EDR_NME_PRAI VNDR_EDR_NME_ROGN VNDR_EDR_NME_SEYM VNDR_EDR_NME_VELO VNDR_EDR_NME_WHET VNDR_EDR_NME_WINT EDR_VNDR_PRT EDR_VNDR_SALIENT AXLE_SIDE_L AXLE_SIDE_R EQP_AXLE_NBR_1 EQP_AXLE_NBR_2 EQP_AXLE_NBR_3 EQP_AXLE_NBR_4 EQP_AXLE_NBR_5 EQP_AXLE_NBR_6 EQP_AXLE_NBR_7 EQP_AXLE_NBR_8 EQP_AXLE_NBR_9 EQP_GRP_BOXC EQP_GRP_FLAT EQP_GRP_GOND EQP_GRP_HOPP EQP_GRP_IFLT EQP_GRP_MFLT EQP_GRP_MISC EQP_GRP_TANK EQP_GRP_VFLT
(AFOA23300L2-2011-02-09::2014-02-09, 42) 743 87.0 6.11 8.91 2.80 1.46 48.31 26.2 25.0 286000.0 4692.98 17.832000 38.42 23.446000 44.88 5.614000 9.59 1.458000 1.87 44.308000 55.67 74.760000 145.1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
(AFOA23301L1-2009-09-12::2012-09-12, 147) 1077 16.0 6.08 8.56 2.48 1.41 56.62 26.3 25.0 286000.0 -999.00 23.548333 41.85 26.635000 46.10 3.086667 4.25 1.216667 1.41 43.356667 56.62 86.016667 145.3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
(AFOA23308L1-2011-02-07::2014-02-07, 239) 716 73.0 6.51 6.99 0.48 1.07 20.71 26.1 25.0 286000.0 -999.00 24.342000 36.97 27.088000 41.70 2.746000 4.73 1.126000 1.23 31.230000 39.50 96.440000 145.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
(AFOA23309L4-2011-02-13::2014-02-13, 450) 859 4.0 7.16 8.25 1.09 1.15 29.08 26.4 25.0 286000.0 -999.00 7.160000 7.16 8.250000 8.25 1.090000 1.09 1.150000 1.15 29.080000 29.08 26.400000 26.4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
(AFOA23313L2-2011-07-02::2014-07-02, 566) 525 57.0 6.71 7.87 1.16 1.17 36.02 26.0 25.0 286000.0 -999.00 21.556667 37.97 23.621667 40.77 2.065000 3.74 1.121667 1.18 36.666667 39.53 85.533333 145.8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0

Baseline model

Split training and test data

In [31]:
Image(url= "figures/overfitting.png", width=700, height=700)
Out[31]:
In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
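With only ~7% positive targets, a purely random split can shift the class ratio between the partitions. An alternative (not used here) is to stratify the split on the target:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42)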
In [33]:
# #############################################################################
# Benchmark classifiers

def benchmark(clf, print_report=True, print_cm=True):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    fpr, tpr, roc_thresholds = metrics.roc_curve(y_test, pred, pos_label=True)
    prec_crd, rec_crd, pr_thresholds = metrics.precision_recall_curve(y_test, pred, pos_label=True)
    auc = metrics.auc(fpr, tpr) 
    precision = metrics.precision_score(y_test, pred)
    recall = metrics.recall_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        print()

    if print_report:
        print("classification report:")
        # boolean classes sort as [False, True], so 'No peak' must come first
        print(metrics.classification_report(y_test, pred, target_names=['No peak', 'Peak']))

    if print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, auc, precision, recall


results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
        (Perceptron(max_iter=50, tol=1e-3), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=50, tol=1e-3),
         "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False,
                                       tol=1e-3)))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50,
                                           penalty=penalty)))

# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, max_iter=50,
                                       penalty="elasticnet")))

# Train NearestCentroid without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(BernoulliNB(alpha=.01)))

print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(benchmark(Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False,
                                                  tol=1e-3))),
  ('classification', LinearSVC(penalty="l2"))])))

# make some plots

indices = np.arange(0,len(results)*2,2)

results = [[x[i] for x in results] for i in range(5)]


clf_names, score, auc, precision, recall = results


plt.figure(figsize=(18, 8))
plt.title("Score")
plt.barh(indices, score, .2, label="score", color='navy')
plt.barh(indices + .3, auc, .2, label="AUC", color='c')
plt.barh(indices + .6, precision, .2, label="precision", color='darkorange')
plt.barh(indices + .9, recall, .2, label="recall", color='red')



plt.yticks(())
plt.legend(loc='best')
plt.subplots_adjust(left=.25)
plt.subplots_adjust(top=.95)
plt.subplots_adjust(bottom=.05)

for i, c in zip(indices, clf_names):
    plt.text(-.3, i, c)

plt.show()
================================================================================
Ridge Classifier
________________________________________________________________________________
Training: 
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='sag',
        tol=0.01)
train time: 0.601s
test time:  0.008s
accuracy:   0.923
dimensionality: 68
density: 1.000000

classification report:
              precision    recall  f1-score   support

     No peak       0.93      0.99      0.96     21223
        Peak       0.21      0.02      0.04      1657

   micro avg       0.92      0.92      0.92     22880
   macro avg       0.57      0.51      0.50     22880
weighted avg       0.88      0.92      0.89     22880

confusion matrix:
[[21093   130]
 [ 1622    35]]

================================================================================
Perceptron
________________________________________________________________________________
Training: 
Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
      fit_intercept=True, max_iter=50, n_iter=None, n_iter_no_change=5,
      n_jobs=None, penalty=None, random_state=0, shuffle=True, tol=0.001,
      validation_fraction=0.1, verbose=0, warm_start=False)
train time: 0.254s
test time:  0.010s
accuracy:   0.918
dimensionality: 68
density: 1.000000

classification report:
              precision    recall  f1-score   support

     No peak       0.94      0.97      0.96     21223
        Peak       0.37      0.19      0.25      1657

   micro avg       0.92      0.92      0.92     22880
   macro avg       0.66      0.58      0.60     22880
weighted avg       0.90      0.92      0.91     22880

confusion matrix:
[[20684   539]
 [ 1339   318]]

================================================================================
Passive-Aggressive
________________________________________________________________________________
Training: 
PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
              early_stopping=False, fit_intercept=True, loss='hinge',
              max_iter=50, n_iter=None, n_iter_no_change=5, n_jobs=None,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)
train time: 0.144s
test time:  0.019s
accuracy:   0.923
dimensionality: 68
density: 1.000000

classification report:
              precision    recall  f1-score   support

     No peak       0.93      0.99      0.96     21223
        Peak       0.29      0.05      0.08      1657

   micro avg       0.92      0.92      0.92     22880
   macro avg       0.61      0.52      0.52     22880
weighted avg       0.88      0.92      0.90     22880

confusion matrix:
[[21029   194]
 [ 1579    78]]

================================================================================
kNN
________________________________________________________________________________
Training: 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=10, p=2,
           weights='uniform')
train time: 1.906s
test time:  7.392s
accuracy:   0.936
classification report:
              precision    recall  f1-score   support

     No peak       0.95      0.99      0.97     21223
        Peak       0.64      0.27      0.38      1657

   micro avg       0.94      0.94      0.94     22880
   macro avg       0.79      0.63      0.67     22880
weighted avg       0.92      0.94      0.92     22880

confusion matrix:
[[20974   249]
 [ 1215   442]]

================================================================================
Random forest
________________________________________________________________________________
Training: 
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
train time: 9.427s
test time:  0.345s
accuracy:   0.964
classification report:
              precision    recall  f1-score   support

     No peak       0.97      0.99      0.98     21223
        Peak       0.81      0.65      0.72      1657

   micro avg       0.96      0.96      0.96     22880
   macro avg       0.89      0.82      0.85     22880
weighted avg       0.96      0.96      0.96     22880

confusion matrix:
[[20969   254]
 [  578  1079]]

================================================================================
L2 penalty
________________________________________________________________________________
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
     verbose=0)
train time: 0.207s
test time:  0.011s
accuracy:   0.923
dimensionality: 68
density: 1.000000

classification report:
              precision    recall  f1-score   support

     No peak       0.93      0.99      0.96     21223
        Peak       0.26      0.03      0.06      1657

   micro avg       0.92      0.92      0.92     22880
   macro avg       0.59      0.51      0.51     22880
weighted avg       0.88      0.92      0.89     22880

confusion matrix:
[[21071   152]
 [ 1604    53]]

________________________________________________________________________________
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=50,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)
train time: 0.529s
test time:  0.009s
accuracy:   0.921
dimensionality: 68
density: 1.000000

classification report:
              precision    recall  f1-score   support

     No peak       0.93      0.99      0.96     21223
        Peak       0.33      0.09      0.14      1657

   micro avg       0.92      0.92      0.92     22880
   macro avg       0.63      0.54      0.55     22880
weighted avg       0.89      0.92      0.90     22880

confusion matrix:
[[20921   302]
 [ 1511   146]]

================================================================================
L1 penalty
________________________________________________________________________________
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.001,
     verbose=0)
train time: 5.001s
test time:  0.008s
accuracy:   0.961
dimensionality: 68
density: 0.955882

classification report:
              precision    recall  f1-score   support

     No peak       0.97      0.99      0.98     21223
        Peak       0.78      0.64      0.71      1657

   micro avg       0.96      0.96      0.96     22880
   macro avg       0.88      0.81      0.84     22880
weighted avg       0.96      0.96      0.96     22880

confusion matrix:
[[20925   298]
 [  592  1065]]

________________________________________________________________________________
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=50,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l1',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)
train time: 1.010s
test time:  0.010s
accuracy:   0.915
dimensionality: 68
density: 0.955882

classification report:
              precision    recall  f1-score   support

     No peak       0.93      0.98      0.96     21223
        Peak       0.29      0.12      0.17      1657

   micro avg       0.92      0.92      0.92     22880
   macro avg       0.61      0.55      0.56     22880
weighted avg       0.89      0.92      0.90     22880

confusion matrix:
[[20746   477]
 [ 1460   197]]

================================================================================
Elastic-Net penalty
________________________________________________________________________________
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=50,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='elasticnet',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)
train time: 1.023s
test time:  0.010s
accuracy:   0.906
dimensionality: 68
density: 0.955882

classification report:
              precision    recall  f1-score   support

     No peak       0.96      0.94      0.95     21223
        Peak       0.38      0.48      0.43      1657

   micro avg       0.91      0.91      0.91     22880
   macro avg       0.67      0.71      0.69     22880
weighted avg       0.92      0.91      0.91     22880

confusion matrix:
[[19935  1288]
 [  858   799]]

================================================================================
NearestCentroid (aka Rocchio classifier)
________________________________________________________________________________
Training: 
NearestCentroid(metric='euclidean', shrink_threshold=None)
train time: 0.040s
test time:  0.016s
accuracy:   0.219
classification report:
              precision    recall  f1-score   support

     No peak       0.99      0.16      0.28     21223
        Peak       0.08      0.98      0.15      1657

   micro avg       0.22      0.22      0.22     22880
   macro avg       0.54      0.57      0.21     22880
weighted avg       0.92      0.22      0.27     22880

confusion matrix:
[[ 3397 17826]
 [   34  1623]]

================================================================================
Naive Bayes
________________________________________________________________________________
Training: 
BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True)
train time: 0.062s
test time:  0.025s
accuracy:   0.931
dimensionality: 68
density: 1.000000

classification report:
              precision    recall  f1-score   support

     No peak       0.95      0.98      0.96     21223
        Peak       0.54      0.34      0.41      1657

   micro avg       0.93      0.93      0.93     22880
   macro avg       0.75      0.66      0.69     22880
weighted avg       0.92      0.93      0.92     22880

confusion matrix:
[[20751   472]
 [ 1101   556]]

================================================================================
LinearSVC with L1-based feature selection
________________________________________________________________________________
Training: 
Pipeline(memory=None,
     steps=[('feature_selection', SelectFromModel(estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.001,
     verbose=0),
        max_features=None, no...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])
train time: 12.753s
test time:  0.024s
accuracy:   0.933
classification report:
              precision    recall  f1-score   support

     No peak       0.93      1.00      0.97     21223
        Peak       0.82      0.10      0.18      1657

   micro avg       0.93      0.93      0.93     22880
   macro avg       0.88      0.55      0.57     22880
weighted avg       0.93      0.93      0.91     22880

confusion matrix:
[[21187    36]
 [ 1491   166]]

Model improvement

Cross-validation

In [34]:
Image(url= "figures/grid_search_workflow.png", width=700, height=700)
Out[34]:
In [35]:
Image(url= "figures/grid_search_cross_validation.png", width=700, height=700)
Out[35]:
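Before the automated searches below, a minimal cross-validation sketch (illustrative; RFECV and GridSearchCV run this loop internally):

from sklearn.model_selection import cross_val_score

cv_recall = cross_val_score(RandomForestClassifier(n_estimators=100),
                            X_train, y_train,
                            cv=StratifiedKFold(5), scoring='recall')
print("CV recall: %0.3f +/- %0.3f" % (cv_recall.mean(), cv_recall.std()))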

Feature selection

  • Regularisation
  • Univariate
  • Recursive
  • Select from model (sketched below)
  • Boruta
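For comparison with the recursive approach used below, a sketch of the "select from model" option (the median importance threshold is an illustrative choice, not tuned):

selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=0),
                           threshold='median')
selector.fit(X_train, y_train)
print('Features kept:', list(X_train.columns[selector.get_support()]))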
In [36]:
# Create the RFE object and compute a cross-validated score.
model = RandomForestClassifier(n_estimators=300, min_samples_leaf=5, random_state=0, n_jobs=-1, oob_score=True)
# The "accuracy" scoring is proportional to the number of correct
# classifications
rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(2),
              scoring='recall')
rfecv.fit(X_train, y_train)

print("Optimal number of features : %d" % rfecv.n_features_)
Optimal number of features : 55
In [37]:
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation recall score")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
In [38]:
with open('best_features.pkl','wb') as f:
    pickle.dump(rfecv,f)

Random Forest Classifier

In [39]:
Image(url= "figures/RF_simplified.png", width=700, height=700)
Out[39]:

Hyperparameter Optimisation

In [40]:
from sklearn.model_selection import KFold
from sklearn import pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

parameters = {'n_estimators': (100, 300, 500),
              'min_samples_leaf': (1, 5, 10),
              'class_weight': (None,'balanced', 'balanced_subsample') }



gs_clf = GridSearchCV(model, parameters, scoring = 'recall', n_jobs=-1, cv=5, refit=True)
gs_clf = gs_clf.fit(X_train, y_train)
with open('gridCV_optimised.pkl','wb') as f:
    pickle.dump(gs_clf,f)
In [41]:
with open('gridCV_optimised.pkl','rb') as f:
    gs_clf = pickle.load(f)
In [42]:
print('Recall of the best model:',gs_clf.best_score_)
Recall of the best model: 0.8708588587954249
In [43]:
print('Best parameters:',gs_clf.best_params_)
Best parameters: {'class_weight': 'balanced_subsample', 'min_samples_leaf': 10, 'n_estimators': 300}
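Note that best_score_ is the cross-validated recall on the training folds. Since refit=True, the best estimator has been retrained on all of X_train and can be checked on the held-out test set (a sketch):

y_best = gs_clf.best_estimator_.predict(X_test)
print('Test recall:    %0.3f' % recall_score(y_test, y_best))
print('Test precision: %0.3f' % precision_score(y_test, y_best))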

Performance Analysis

In [44]:
# Train baseline RF
bl_rf = RandomForestClassifier(n_estimators=100)
bl_rf.fit(X_train, y_train)
bl_prob = bl_rf.predict_proba(X_test)
y_bl = bl_rf.predict(X_test)
# Train optimal RF with the parameters found by the grid search
opt_rf = RandomForestClassifier(n_estimators=300, min_samples_leaf=10, class_weight='balanced_subsample')
opt_rf.fit(X_train, y_train)
opt_prob = opt_rf.predict_proba(X_test)
y_opt = opt_rf.predict(X_test)

# Compare curves

ROC

In [45]:
Image(url= "figures/TPRvsFPR.PNG", width=700, height=700)
Out[45]:
In [46]:
fpr, tpr, th = roc_curve(y_test, opt_prob[:,1], pos_label=True, sample_weight=None, drop_intermediate=False)
In [47]:
# Create a trace

trace1 = go.Scatter(
    x = [0,1],
    y = [0,1],
    line = dict(dash='dash'),
    showlegend=False
)


fpr, tpr, th = roc_curve(y_test, opt_prob[:,1], pos_label=True, sample_weight=None, drop_intermediate=False)
trace2 = go.Scatter(
    x = fpr,
    y = tpr,
    hovertext=th,
    name = 'ROC curve'
)

layout = go.Layout(title='Receiver Operating Characteristic',
                  xaxis=dict(title='False positive rate'),
                  yaxis=dict(title='True positive rate'))

data = [trace1,trace2]
fig = go.Figure(data=data, layout=layout)
iplot(fig,filename='basic-bar')
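To summarise the baseline-vs-optimised comparison in one number per model, the area under each ROC curve can be computed from the predicted probabilities obtained in In [44] (a sketch):

print('Baseline RF AUC:  %0.3f' % metrics.roc_auc_score(y_test, bl_prob[:, 1]))
print('Optimised RF AUC: %0.3f' % metrics.roc_auc_score(y_test, opt_prob[:, 1]))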