In [3]:
import sqlite3
from pathlib import Path
import pandas as pd
import os
import descartes
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point, Polygon
In [4]:
%matplotlib inline
In [5]:
route_timepoints = pd.read_csv('StaticData/route_timepoints.txt')
routes = pd.read_csv('StaticData/routes.txt')
shapes = pd.read_csv('StaticData/shapes.txt')
stop_times = pd.read_csv('StaticData/stop_times.txt')
stops = pd.read_csv('StaticData/stops.txt')
trips = pd.read_csv('StaticData/trips.txt')
In [14]:
conn = sqlite3.connect('RIPTA.db')
trip_updates_df = pd.read_sql_query('SELECT * FROM trip_updates_temp',conn)
In [15]:
trip_updates_df.head()
Out[15]:
In [11]:
vehicle_positions_df = pd.read_sql_query('SELECT * FROM vehicle_positions',conn)
In [12]:
vehicle_positions_df.head()
Out[12]:
In [13]:
vehicle_positions_df.shape
Out[13]:
In [16]:
trip_updates_df.shape
Out[16]:
In [17]:
###CHECK FOR DUPLICATES IN TRIP_UPDATES
trip_updates_df.duplicated(subset=['timestamp','trip_id','stop_id'],keep=False).value_counts()
Out[17]:
In [18]:
###CHECK FOR DUPLICATES IN VEHICLE POSITIONS
vehicle_positions_df.duplicated(subset=['timestamp','position_lat','position_lon'],keep=False).value_counts()
Out[18]:
In [20]:
vehicle_positions_df[vehicle_positions_df.duplicated(subset=['timestamp','position_lat',
'position_lon'],keep=False)==True]
Out[20]:
In [21]:
vehicle_positions_df['update_timestamp'].value_counts()
Out[21]:
In [22]:
trip_updates_df['update_timestamp'].value_counts()
Out[22]:
In [23]:
###ROUNDING TIMESTAMPS TO THE NEAREST 100 SECS TO INCREASE LIKELIHOOD OF MATCH BETWEEN TRIP_UPDATE TIMESTAMP
###AND VEHICLE_POSITION TIMESTAMP
vehicle_positions_df['timestamp_rounded'] = round(vehicle_positions_df['timestamp'],-2)
vehicle_positions_df.head()
Out[23]:
In [24]:
trip_updates_df['timestamp_rounded'] = round(trip_updates_df['timestamp'],-2)
trip_updates_df.head()
Out[24]:
In [25]:
###MATCHING VEHICLE POSITIONS WITH THEIR RESPECTIVE DELAY DATA
results = vehicle_positions_df.merge(trip_updates_df[['timestamp_rounded','trip_id','delay','stop_id']],
on=['timestamp_rounded','trip_id','stop_id'],how='left')
In [26]:
results.duplicated(keep=False).value_counts()
Out[26]:
In [27]:
results.head()
Out[27]:
In [28]:
###NUMBER OF BUSES THAT RAN ON TIME
results['delay'].isna().sum()
Out[28]:
In [29]:
results['delay']=results['delay'].fillna(0)
results.head()
Out[29]:
In [30]:
pvd_map = gpd.read_file('StaticData/pvdmap/geo_export_e37ca5c5-fc9d-45b0-b497-72b028d29c57.shp')
ri_map = gpd.read_file('StaticData/rimap/tl_2016_44_tract.shp')
In [31]:
geometry = [Point(xy) for xy in zip(results['position_lon'],results['position_lat'])]
geometry[:3]
Out[31]:
In [32]:
geometry_stops = [Point(xy) for xy in zip(stops['stop_lon'],stops['stop_lat'])]
geometry_stops[:3]
Out[32]:
In [33]:
crs = {'init':'epsg:4326'}
geo_df = gpd.GeoDataFrame(results,crs=crs,geometry=geometry)
geo_df.head()
Out[33]:
In [34]:
crs = {'init':'epsg:4326'}
geo_stops_df = gpd.GeoDataFrame(stops,crs=crs,geometry=geometry_stops)
geo_stops_df.head()
Out[34]:
In [35]:
fig,ax = plt.subplots(figsize=(15,15))
ri_map.plot(ax=ax, color='grey')
geo_stops_df.plot(ax=ax,markersize=0.1,color='white',marker='x',label='stops')
geo_df[geo_df['delay']<0].plot(ax=ax,markersize=5,color='green',marker='o',label='early')
geo_df[geo_df['delay']==0].plot(ax=ax,markersize=5,color='black',marker='o',label='on time')
geo_df[geo_df['delay']<300][geo_df['delay']>0].plot(ax=ax,markersize=5,color='yellow',marker='o',label='<5 min late')
geo_df[geo_df['delay']<600][geo_df['delay']>300].plot(ax=ax,markersize=5,color='orange',marker='o',label='<10 min late')
geo_df[geo_df['delay']>600].plot(ax=ax,markersize=5,color='red',marker='o',label='>10 min late')
legend = plt.legend(prop={'size':15})
frame = legend.get_frame()
frame.set_facecolor('grey')
plt.title('Bus Locations and stops in RI recorded on Aug 2, 2019',fontsize=22)
plt.savefig('RI_RIPTA_Aug2')
In [36]:
###IDENTIFYING (ROUGHLY) WHICH BUSES AND STOPS ARE IN PROVIDENCE (MAIN METRO AREA)
def inpvd(lon,lat):
if -71.472716 < lon < -71.365464:
if 41.787151 < lat < 41.858831:
return True
return False
geo_df['inpvd'] = [inpvd(x,y) for x,y in zip(geo_df['position_lon'],geo_df['position_lat'])]
geo_stops_df['indpvd'] = [inpvd(x,y) for x,y in zip(geo_stops_df['stop_lon'],geo_stops_df['stop_lat'])]
In [37]:
geo_df_pvd = geo_df[geo_df['inpvd']==True]
fig,ax = plt.subplots(figsize=(15,15))
pvd_map.plot(ax=ax, color='grey')
geo_stops_df[geo_stops_df['indpvd']==True].plot(ax=ax,markersize=0.5,color='white',marker='x',label='stops')
geo_df_pvd[geo_df['delay']<0].plot(ax=ax,markersize=5,color='green',marker='o',label='early')
geo_df_pvd[geo_df['delay']==0].plot(ax=ax,markersize=5,color='black',marker='o',label='on time')
geo_df_pvd[geo_df['delay']<300][geo_df['delay']>0].plot(ax=ax,markersize=5,color='yellow',marker='o',label='<5 min late')
geo_df_pvd[geo_df['delay']<600][geo_df['delay']>300].plot(ax=ax,markersize=5,color='orange',marker='o',label='<10 min late')
geo_df_pvd[geo_df['delay']>600].plot(ax=ax,markersize=5,color='red',marker='o',label='>10 min late')
legend = plt.legend(prop={'size':15})
frame = legend.get_frame()
frame.set_facecolor('grey')
plt.title('Bus Locations and stops in Providence recorded on Aug 2, 2019',fontsize=22)
plt.savefig('Providence_RIPTA_aug2')
In [38]:
inpvd_counts = []
inri_counts = []
inpvd_counts.append(geo_df[geo_df['inpvd']==True][geo_df['delay']<0]['inpvd'].count())
inpvd_counts.append(geo_df[geo_df['inpvd']==True][geo_df['delay']==0]['inpvd'].count())
inpvd_counts.append(geo_df[geo_df['inpvd']==True][geo_df['delay']>0][geo_df['delay']<300]['inpvd'].count())
inpvd_counts.append(geo_df[geo_df['inpvd']==True][geo_df['delay']>300][geo_df['delay']<600]['inpvd'].count())
inpvd_counts.append(geo_df[geo_df['inpvd']==True][geo_df['delay']>600]['inpvd'].count())
inri_counts.append(geo_df[geo_df['inpvd']==False][geo_df['delay']<0]['inpvd'].count())
inri_counts.append(geo_df[geo_df['inpvd']==False][geo_df['delay']==0]['inpvd'].count())
inri_counts.append(geo_df[geo_df['inpvd']==False][geo_df['delay']>0][geo_df['delay']<300]['inpvd'].count())
inri_counts.append(geo_df[geo_df['inpvd']==False][geo_df['delay']>300][geo_df['delay']<600]['inpvd'].count())
inri_counts.append(geo_df[geo_df['inpvd']==False][geo_df['delay']>600]['inpvd'].count())
inpvd_counts,inri_counts
Out[38]:
In [39]:
import numpy as np
fig,ax = plt.subplots(figsize=(10,10))
ind = np.arange(len(inri_counts))
width = 0.35
plt.bar(ind,inpvd_counts,width,label='delay_counts_in_pvd')
plt.bar(ind+width,inri_counts,width,label='delay_counts_not_in_pvd')
plt.ylabel('Count_in_range')
plt.xlabel('Range')
plt.xticks(ind+width/2,('early','on time','0 to 5 min late','5 to 10 min late','>10 min late'),rotation=30)
plt.legend(loc='best')
plt.title('Delays in vs out of Providence',fontsize=22)
plt.savefig('RIPTA_delays_bar_Aug2')
In [46]:
conn = sqlite3.connect('RIPTA.db')
table_info = pd.read_sql_query('SELECT * FROM dbstat',conn)
table_info.head()
Out[46]:
In [47]:
trip_updates_size = table_info[table_info['name']=='trip_updates_temp']['payload'].sum()
trip_updates_size
Out[47]:
In [62]:
stop_times['arrival_time'].sort_values(ascending=False).head(10)
Out[62]:
In [ ]: