RIPTA

Last updated: August 18, 2019
In [3]:
import sqlite3
from pathlib import Path
import pandas as pd
import os
import descartes
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point, Polygon
In [4]:
%matplotlib inline
In [5]:
route_timepoints = pd.read_csv('StaticData/route_timepoints.txt')
routes = pd.read_csv('StaticData/routes.txt')
shapes = pd.read_csv('StaticData/shapes.txt')
stop_times = pd.read_csv('StaticData/stop_times.txt')
stops = pd.read_csv('StaticData/stops.txt')
trips = pd.read_csv('StaticData/trips.txt')
In [14]:
conn = sqlite3.connect('RIPTA.db')
trip_updates_df = pd.read_sql_query('SELECT * FROM trip_updates_temp',conn)
In [15]:
trip_updates_df.head()
Out[15]:
id update_timestamp timestamp trip_id stop_id delay
0 1 1564775906 1564775887 2854283 1480 600
1 2 1564775906 1564775887 2854283 1430 600
2 3 1564775906 1564775887 2854283 1365 600
3 4 1564775906 1564775887 2854283 1345 600
4 5 1564775906 1564775887 2854283 1382 600
In [11]:
vehicle_positions_df = pd.read_sql_query('SELECT * FROM vehicle_positions',conn)
In [12]:
vehicle_positions_df.head()
Out[12]:
id update_timestamp trip_id route_id trip_start_time trip_start_date position_lat position_lon bearing speed current_status stop_id timestamp vehicle_id
0 1 1564775906 2854283 63 15:22:00 20190802 41.498170 -71.307450 195 8.49376 2 1480 1564775822 1005
1 2 1564775906 2854254 63 15:15:00 20190802 41.526290 -71.297005 15 10.28192 2 2415 1564775864 1002
2 3 1564775906 2854255 63 15:45:00 20190802 41.514000 -71.309850 345 9.38784 2 56765 1564775870 543
3 4 1564775906 2854178 14 15:37:00 20190802 41.493565 -71.437940 360 19.66976 2 1875 1564775851 547
4 5 1564775906 2854170 14 14:29:00 20190802 41.762634 -71.431335 15 9.83488 2 72170 1564775870 538
In [13]:
vehicle_positions_df.shape
Out[13]:
(620, 14)
In [16]:
trip_updates_df.shape
Out[16]:
(7672, 6)
In [17]:
###CHECK FOR DUPLICATES IN TRIP_UPDATES
trip_updates_df.duplicated(subset=['timestamp','trip_id','stop_id'],keep=False).value_counts()
Out[17]:
False    7672
dtype: int64
In [18]:
###CHECK FOR DUPLICATES IN VEHICLE POSITIONS
vehicle_positions_df.duplicated(subset=['timestamp','position_lat','position_lon'],keep=False).value_counts()
Out[18]:
False    616
True       4
dtype: int64
In [20]:
vehicle_positions_df[vehicle_positions_df.duplicated(subset=['timestamp','position_lat','position_lon'],keep=False)==True]
Out[20]:
id update_timestamp trip_id route_id trip_start_time trip_start_date position_lat position_lon bearing speed current_status stop_id timestamp vehicle_id
109 110 1564775906 2865119 78 15:22:00 20190802 41.824028 -71.40659 330 6.25856 2 16525 1564775862 520
261 262 1564776206 2865119 78 15:22:00 20190802 41.824028 -71.40659 330 6.25856 2 15150 1564775862 520
415 416 1564776506 2865119 78 15:22:00 20190802 41.824028 -71.40659 330 6.25856 2 72650 1564775862 520
572 573 1564776806 2865119 78 15:22:00 20190802 41.824028 -71.40659 330 6.25856 2 58815 1564775862 520
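These four rows are the same vehicle report (identical timestamp, coordinates, bearing, and speed for vehicle 520) carried forward across the four feed polls; only the update_timestamp and reported stop_id differ. The notebook keeps them, but if deduplication were wanted, a minimal sketch (assuming identical timestamp and coordinates fully identify a repeat) could be:

vehicle_positions_dedup = vehicle_positions_df.drop_duplicates(
    subset=['timestamp', 'position_lat', 'position_lon'], keep='first')  # keep the earliest poll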
In [21]:
vehicle_positions_df['update_timestamp'].value_counts()
Out[21]:
1564776806    158
1564776506    156
1564775906    154
1564776206    152
Name: update_timestamp, dtype: int64
In [22]:
trip_updates_df['update_timestamp'].value_counts()
Out[22]:
1564776806    1985
1564776506    1917
1564775906    1907
1564776206    1863
Name: update_timestamp, dtype: int64
In [23]:
###ROUNDING TIMESTAMPS TO THE NEAREST 100 SECS TO INCREASE LIKELIHOOD OF MATCH BETWEEN TRIP_UPDATE TIMESTAMP 
###AND VEHICLE_POSITION TIMESTAMP
vehicle_positions_df['timestamp_rounded'] = round(vehicle_positions_df['timestamp'],-2)
vehicle_positions_df.head()
Out[23]:
id update_timestamp trip_id route_id trip_start_time trip_start_date position_lat position_lon bearing speed current_status stop_id timestamp vehicle_id timestamp_rounded
0 1 1564775906 2854283 63 15:22:00 20190802 41.498170 -71.307450 195 8.49376 2 1480 1564775822 1005 1564775800
1 2 1564775906 2854254 63 15:15:00 20190802 41.526290 -71.297005 15 10.28192 2 2415 1564775864 1002 1564775900
2 3 1564775906 2854255 63 15:45:00 20190802 41.514000 -71.309850 345 9.38784 2 56765 1564775870 543 1564775900
3 4 1564775906 2854178 14 15:37:00 20190802 41.493565 -71.437940 360 19.66976 2 1875 1564775851 547 1564775900
4 5 1564775906 2854170 14 14:29:00 20190802 41.762634 -71.431335 15 9.83488 2 72170 1564775870 538 1564775900
In [24]:
trip_updates_df['timestamp_rounded'] = round(trip_updates_df['timestamp'],-2)
trip_updates_df.head()
Out[24]:
id update_timestamp timestamp trip_id stop_id delay timestamp_rounded
0 1 1564775906 1564775887 2854283 1480 600 1564775900
1 2 1564775906 1564775887 2854283 1430 600 1564775900
2 3 1564775906 1564775887 2854283 1365 600 1564775900
3 4 1564775906 1564775887 2854283 1345 600 1564775900
4 5 1564775906 1564775887 2854283 1382 600 1564775900
In [25]:
###MATCHING VEHICLE POSITIONS WITH THEIR RESPECTIVE DELAY DATA
results = vehicle_positions_df.merge(trip_updates_df[['timestamp_rounded','trip_id','delay','stop_id']],
                                     on=['timestamp_rounded','trip_id','stop_id'],how='left')
In [26]:
results.duplicated(keep=False).value_counts()
Out[26]:
False    620
dtype: int64
In [27]:
results.head()
Out[27]:
id update_timestamp trip_id route_id trip_start_time trip_start_date position_lat position_lon bearing speed current_status stop_id timestamp vehicle_id timestamp_rounded delay
0 1 1564775906 2854283 63 15:22:00 20190802 41.498170 -71.307450 195 8.49376 2 1480 1564775822 1005 1564775800 NaN
1 2 1564775906 2854254 63 15:15:00 20190802 41.526290 -71.297005 15 10.28192 2 2415 1564775864 1002 1564775900 660.0
2 3 1564775906 2854255 63 15:45:00 20190802 41.514000 -71.309850 345 9.38784 2 56765 1564775870 543 1564775900 -60.0
3 4 1564775906 2854178 14 15:37:00 20190802 41.493565 -71.437940 360 19.66976 2 1875 1564775851 547 1564775900 180.0
4 5 1564775906 2854170 14 14:29:00 20190802 41.762634 -71.431335 15 9.83488 2 72170 1564775870 538 1564775900 960.0
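Rounding both feeds to the nearest 100 seconds recovers many matches, but a position and its trip update can still land on opposite sides of a rounding boundary. An alternative sketch (not what this notebook does) is a nearest-timestamp join on the raw timestamps, here with a hypothetical 60-second tolerance:

alt = pd.merge_asof(
    vehicle_positions_df.sort_values('timestamp'),
    trip_updates_df[['timestamp','trip_id','stop_id','delay']].sort_values('timestamp'),
    on='timestamp', by=['trip_id','stop_id'],
    direction='nearest', tolerance=60)  # nearest update within 60 s for the same trip and stop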
In [28]:
###NUMBER OF POSITION REPORTS WITH NO MATCHING DELAY RECORD (ASSUMED ON TIME)
results['delay'].isna().sum()
Out[28]:
397
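So 397 of the 620 position reports (about 64%) have no matching delay record; the next cell treats them as on time by filling the missing delays with 0. A quick check of that share:

no_delay_share = results['delay'].isna().mean()   # ≈ 397 / 620 ≈ 0.64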
In [29]:
results['delay']=results['delay'].fillna(0)
results.head()
Out[29]:
id update_timestamp trip_id route_id trip_start_time trip_start_date position_lat position_lon bearing speed current_status stop_id timestamp vehicle_id timestamp_rounded delay
0 1 1564775906 2854283 63 15:22:00 20190802 41.498170 -71.307450 195 8.49376 2 1480 1564775822 1005 1564775800 0.0
1 2 1564775906 2854254 63 15:15:00 20190802 41.526290 -71.297005 15 10.28192 2 2415 1564775864 1002 1564775900 660.0
2 3 1564775906 2854255 63 15:45:00 20190802 41.514000 -71.309850 345 9.38784 2 56765 1564775870 543 1564775900 -60.0
3 4 1564775906 2854178 14 15:37:00 20190802 41.493565 -71.437940 360 19.66976 2 1875 1564775851 547 1564775900 180.0
4 5 1564775906 2854170 14 14:29:00 20190802 41.762634 -71.431335 15 9.83488 2 72170 1564775870 538 1564775900 960.0
In [30]:
pvd_map = gpd.read_file('StaticData/pvdmap/geo_export_e37ca5c5-fc9d-45b0-b497-72b028d29c57.shp')
ri_map = gpd.read_file('StaticData/rimap/tl_2016_44_tract.shp')
In [31]:
geometry = [Point(xy) for xy in zip(results['position_lon'],results['position_lat'])]
geometry[:3]
Out[31]:
[<shapely.geometry.point.Point at 0x7f514a1f26a0>,
 <shapely.geometry.point.Point at 0x7f514a1f2748>,
 <shapely.geometry.point.Point at 0x7f514a1f2b00>]
In [32]:
geometry_stops = [Point(xy) for xy in zip(stops['stop_lon'],stops['stop_lat'])]
geometry_stops[:3]
Out[32]:
[<shapely.geometry.point.Point at 0x7f514a223240>,
 <shapely.geometry.point.Point at 0x7f514a2237f0>,
 <shapely.geometry.point.Point at 0x7f514a2239e8>]
In [33]:
crs = {'init':'epsg:4326'}
geo_df = gpd.GeoDataFrame(results,crs=crs,geometry=geometry)
geo_df.head()
Out[33]:
id update_timestamp trip_id route_id trip_start_time trip_start_date position_lat position_lon bearing speed current_status stop_id timestamp vehicle_id timestamp_rounded delay geometry
0 1 1564775906 2854283 63 15:22:00 20190802 41.498170 -71.307450 195 8.49376 2 1480 1564775822 1005 1564775800 0.0 POINT (-71.30745 41.49817)
1 2 1564775906 2854254 63 15:15:00 20190802 41.526290 -71.297005 15 10.28192 2 2415 1564775864 1002 1564775900 660.0 POINT (-71.2970047 41.52629)
2 3 1564775906 2854255 63 15:45:00 20190802 41.514000 -71.309850 345 9.38784 2 56765 1564775870 543 1564775900 -60.0 POINT (-71.30985 41.514)
3 4 1564775906 2854178 14 15:37:00 20190802 41.493565 -71.437940 360 19.66976 2 1875 1564775851 547 1564775900 180.0 POINT (-71.43794 41.4935646)
4 5 1564775906 2854170 14 14:29:00 20190802 41.762634 -71.431335 15 9.83488 2 72170 1564775870 538 1564775900 960.0 POINT (-71.43133539999999 41.7626343)
In [34]:
crs = {'init':'epsg:4326'}
geo_stops_df = gpd.GeoDataFrame(stops,crs=crs,geometry=geometry_stops)
geo_stops_df.head()
Out[34]:
stop_id stop_code stop_name stop_desc stop_lat stop_lon zone_id stop_url location_type parent_station stop_associated_place geometry
0 5 NaN WESTERLY TOWN HALL NaN 41.377000 -71.829771 NaN NaN 0 NaN NaN POINT (-71.82977099999999 41.377)
1 10 NaN GROVE NS WILCOX NaN 41.378328 -71.826680 NaN NaN 0 NaN NaN POINT (-71.82668000000001 41.378328)
2 20 NaN WESTERLY RAILROAD STATION NaN 41.381108 -71.829808 NaN NaN 0 NaN west POINT (-71.829808 41.381108)
3 25 NaN HIGH FS GROVE NaN 41.381039 -71.825720 NaN NaN 0 NaN NaN POINT (-71.82571999999999 41.381039)
4 50 NaN BOON OPP PERKINS NaN 41.424357 -71.460620 NaN NaN 0 NaN NaN POINT (-71.46061999999999 41.424357)
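A side note on the CRS argument: the {'init': 'epsg:4326'} dict was the accepted form in geopandas at the time; newer geopandas/pyproj releases deprecate it in favor of a plain string, e.g.:

geo_df = gpd.GeoDataFrame(results, crs='EPSG:4326', geometry=geometry)   # equivalent in newer geopandas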
In [35]:
fig,ax = plt.subplots(figsize=(15,15))
ri_map.plot(ax=ax, color='grey')
geo_stops_df.plot(ax=ax,markersize=0.1,color='white',marker='x',label='stops')
geo_df[geo_df['delay']<0].plot(ax=ax,markersize=5,color='green',marker='o',label='early')
geo_df[geo_df['delay']==0].plot(ax=ax,markersize=5,color='black',marker='o',label='on time')
geo_df[(geo_df['delay']>0) & (geo_df['delay']<300)].plot(ax=ax,markersize=5,color='yellow',marker='o',label='0 to 5 min late')
geo_df[(geo_df['delay']>300) & (geo_df['delay']<600)].plot(ax=ax,markersize=5,color='orange',marker='o',label='5 to 10 min late')
geo_df[geo_df['delay']>600].plot(ax=ax,markersize=5,color='red',marker='o',label='>10 min late')
legend = plt.legend(prop={'size':15})
frame = legend.get_frame()
frame.set_facecolor('grey')
plt.title('Bus Locations and stops in RI recorded on Aug 2, 2019',fontsize=22)
plt.savefig('RI_RIPTA_Aug2')
In [36]:
###IDENTIFYING (ROUGHLY) WHICH BUSES AND STOPS ARE IN PROVIDENCE (MAIN METRO AREA)
def inpvd(lon,lat):
    if -71.472716 < lon < -71.365464:
        if 41.787151 < lat < 41.858831:
            return True
    return False

geo_df['inpvd'] = [inpvd(x,y) for x,y in zip(geo_df['position_lon'],geo_df['position_lat'])]
geo_stops_df['inpvd'] = [inpvd(x,y) for x,y in zip(geo_stops_df['stop_lon'],geo_stops_df['stop_lat'])]
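The rectangle above is only a rough proxy for Providence. Since the Providence shapefile is already loaded as pvd_map, a more precise alternative (a sketch, assuming the shapefile carries a CRS and covers the city boundary) is to test each point against the actual polygons:

pvd_union = pvd_map.to_crs(geo_df.crs).unary_union           # dissolve the city polygons into one geometry
geo_df['inpvd_precise'] = geo_df.geometry.within(pvd_union)  # point-in-polygon test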
In [37]:
geo_df_pvd = geo_df[geo_df['inpvd']==True]
fig,ax = plt.subplots(figsize=(15,15))
pvd_map.plot(ax=ax, color='grey')
geo_stops_df[geo_stops_df['inpvd']==True].plot(ax=ax,markersize=0.5,color='white',marker='x',label='stops')
geo_df_pvd[geo_df_pvd['delay']<0].plot(ax=ax,markersize=5,color='green',marker='o',label='early')
geo_df_pvd[geo_df_pvd['delay']==0].plot(ax=ax,markersize=5,color='black',marker='o',label='on time')
geo_df_pvd[(geo_df_pvd['delay']>0) & (geo_df_pvd['delay']<300)].plot(ax=ax,markersize=5,color='yellow',marker='o',label='0 to 5 min late')
geo_df_pvd[(geo_df_pvd['delay']>300) & (geo_df_pvd['delay']<600)].plot(ax=ax,markersize=5,color='orange',marker='o',label='5 to 10 min late')
geo_df_pvd[geo_df_pvd['delay']>600].plot(ax=ax,markersize=5,color='red',marker='o',label='>10 min late')
legend = plt.legend(prop={'size':15})
frame = legend.get_frame()
frame.set_facecolor('grey')
plt.title('Bus Locations and stops in Providence recorded on Aug 2, 2019',fontsize=22)
plt.savefig('Providence_RIPTA_aug2')
In [38]:
inpvd_counts = []
inri_counts = []

inpvd_counts.append(geo_df[(geo_df['inpvd']==True) & (geo_df['delay']<0)]['inpvd'].count())
inpvd_counts.append(geo_df[(geo_df['inpvd']==True) & (geo_df['delay']==0)]['inpvd'].count())
inpvd_counts.append(geo_df[(geo_df['inpvd']==True) & (geo_df['delay']>0) & (geo_df['delay']<300)]['inpvd'].count())
inpvd_counts.append(geo_df[(geo_df['inpvd']==True) & (geo_df['delay']>300) & (geo_df['delay']<600)]['inpvd'].count())
inpvd_counts.append(geo_df[(geo_df['inpvd']==True) & (geo_df['delay']>600)]['inpvd'].count())
inri_counts.append(geo_df[(geo_df['inpvd']==False) & (geo_df['delay']<0)]['inpvd'].count())
inri_counts.append(geo_df[(geo_df['inpvd']==False) & (geo_df['delay']==0)]['inpvd'].count())
inri_counts.append(geo_df[(geo_df['inpvd']==False) & (geo_df['delay']>0) & (geo_df['delay']<300)]['inpvd'].count())
inri_counts.append(geo_df[(geo_df['inpvd']==False) & (geo_df['delay']>300) & (geo_df['delay']<600)]['inpvd'].count())
inri_counts.append(geo_df[(geo_df['inpvd']==False) & (geo_df['delay']>600)]['inpvd'].count())

inpvd_counts,inri_counts    
Out[38]:
([24, 157, 54, 18, 17], [8, 240, 38, 24, 19])
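In relative terms (note the strict inequalities leave delays of exactly 300 or 600 seconds out of every bucket, so the counts cover 599 of the 620 reports): roughly 67% of Providence reports and 75% of reports elsewhere were early or on time under the fill-with-0 assumption. A quick sketch of that arithmetic:

pvd_total, ri_total = sum(inpvd_counts), sum(inri_counts)                # 270, 329
pvd_on_time_or_early = (inpvd_counts[0] + inpvd_counts[1]) / pvd_total   # ≈ 0.67
ri_on_time_or_early = (inri_counts[0] + inri_counts[1]) / ri_total       # ≈ 0.75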
In [39]:
import numpy as np

fig,ax = plt.subplots(figsize=(10,10))
ind = np.arange(len(inri_counts))
width = 0.35
plt.bar(ind,inpvd_counts,width,label='delay_counts_in_pvd')
plt.bar(ind+width,inri_counts,width,label='delay_counts_not_in_pvd')
plt.ylabel('Count_in_range')
plt.xlabel('Range')
plt.xticks(ind+width/2,('early','on time','0 to 5 min late','5 to 10 min late','>10 min late'),rotation=30)
plt.legend(loc='best')
plt.title('Delays in vs out of Providence',fontsize=22)
plt.savefig('RIPTA_delays_bar_Aug2')
In [46]:
conn = sqlite3.connect('RIPTA.db')
table_info = pd.read_sql_query('SELECT * FROM dbstat',conn)
table_info.head()
Out[46]:
name path pageno pagetype ncell payload unused mx_payload pgoffset pgsize
0 sqlite_master / 1 leaf 5 1574 2390 503 0 4096
1 sqlite_sequence / 3 leaf 2 44 4036 22 8192 4096
2 trip_updates / 2 leaf 0 0 4088 0 4096 4096
3 trip_updates_temp / 5 internal 50 0 3684 0 16384 4096
4 trip_updates_temp /000/ 9 leaf 156 3432 3 23 32768 4096
In [47]:
trip_updates_size = table_info[table_info['name']=='trip_updates_temp']['payload'].sum()
trip_updates_size
Out[47]:
167680
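This is the cell payload reported by dbstat for trip_updates_temp only, excluding page overhead: roughly 164 KB for the 7,672 rows collected over the four polls, or about 22 bytes per stored trip update. A quick sketch:

bytes_per_row = trip_updates_size / len(trip_updates_df)   # ≈ 167680 / 7672 ≈ 21.9
size_kb = trip_updates_size / 1024                         # ≈ 163.8 KB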
In [62]:
stop_times['arrival_time'].sort_values(ascending=False).head(10)
Out[62]:
210082    25:48:00
210081    25:47:00
210080    25:46:00
210078    25:46:00
210079    25:46:00
210077    25:45:00
210076    25:37:00
210075    25:37:00
24753     25:37:00
210074    25:36:00
Name: arrival_time, dtype: object
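Arrival times past 24:00:00 are valid GTFS: stop_times hours may exceed 23 for trips that continue after midnight on their service day, so 25:48:00 means 1:48 AM the following morning. A small sketch (assuming every row has a populated arrival_time) converts these extended times to seconds past the service day's midnight so they sort and subtract numerically:

def gtfs_time_to_seconds(t):
    h, m, s = (int(x) for x in t.split(':'))
    return h * 3600 + m * 60 + s

stop_times['arrival_secs'] = stop_times['arrival_time'].map(gtfs_time_to_seconds)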