import ibis
# Unlike R, Ibis doesn't have non-standard evaluation. Instead we use this, which is
# probably better; all these years & I still don't understand closures in R.
from ibis import _
# This provides a suite of column selection helpers like tidyselect.
import ibis.selectors as s
# NOTE: By default, Ibis Tables print just the schema. This makes them print a preview
# of the data, like Data Frames in R or Pandas do.
ibis.options.interactive = True

# NOTE: The Polars backend can't do window functions. You'll get an error when calling
# `rank()` or `row_number()`.
# Use the DuckDB backend instead; every table below is read through this connection.
con = ibis.duckdb.connect()
Every holiday season, the Federal Transit Administration gifts us a dataset: How much each transit agency’s vehicles broke down during the prior calendar year. It’s 2024. We now have 2023 data. Time to prove once again that New Jersey Transit isn’t the worst in the nation, and therefore everything is fine and you should all stop whining or voting.
But this year, I discovered Ibis, yet another Python library for working with data. If you paid attention, you know I’ve only used R–but exclusively its Tidyverse “dialect”–on this blog to date. I’ve been meaning to write about why I avoid Pandas (a.k.a. Base R for Python), or any other data analysis toolset on the Wall of Shame (base R, PySpark). But unlike those, Ibis is The Way. At long last I’m convinced to migrate away from my beloved Tidyverse + RStudio stack.
In this post I’ll demonstrate how to clean breakdown data from the FTA’s National Transit Database (NTD) using Ibis. I also recommend Visual Studio Code, its Data Wrangler extension, and Jupyter (only for interactive work), which together provide an RStudio-like experience.
While this stack has its hiccups (noted below)—Tidyverse + RStudio still provide the simplest, most ergonomic toolset for exploratory data analysis—Ibis is quite nice and ready for prime time. I bet it’ll one day overtake the others.
Stay tuned for my 2023 Transit Breakdown Rankings in a later post. Now on with the code.
# Source: https://www.transit.dot.gov/ntd/data-product/2023-breakdowns
# Direct download: https://data.transportation.gov/resource/amkt-4ehs.csv?$query=SELECT%0A%20%20%60agency%60%2C%0A%20%20%60city%60%2C%0A%20%20%60state%60%2C%0A%20%20%60ntd_id%60%2C%0A%20%20%60organization_type%60%2C%0A%20%20%60reporter_type%60%2C%0A%20%20%60report_year%60%2C%0A%20%20%60uace_code%60%2C%0A%20%20%60uza_name%60%2C%0A%20%20%60primary_uza_population%60%2C%0A%20%20%60agency_voms%60%2C%0A%20%20%60mode%60%2C%0A%20%20%60mode_name%60%2C%0A%20%20%60type_of_service%60%2C%0A%20%20%60mode_voms%60%2C%0A%20%20%60major_mechanical_failures%60%2C%0A%20%20%60major_mechanical_failures_1%60%2C%0A%20%20%60other_mechanical_failures%60%2C%0A%20%20%60other_mechanical_failures_1%60%2C%0A%20%20%60total_mechanical_failures%60%2C%0A%20%20%60total_mechanical_failures_1%60%2C%0A%20%20%60vehicle_passenger_car_miles%60%2C%0A%20%20%60vehicle_passenger_car_miles_1%60%2C%0A%20%20%60vehicle_passenger_car_revenue%60%2C%0A%20%20%60vehicle_passenger_car_miles_2%60%2C%0A%20%20%60train_miles%60%2C%0A%20%20%60train_miles_questionable%60%2C%0A%20%20%60train_revenue_miles%60%2C%0A%20%20%60train_revenue_miles_1%60%0AWHERE%20caseless_one_of(%60report_year%60%2C%20%222023%22)
# The NTD for 2022 & onward are on the Federal gov't's decent open data website with a
# REST API that's actually nice to work with. However, the community-made Python API
# client, sodapy, is no longer maintained: https://pypi.org/project/sodapy/
# It still works, but it returns all fields as strings.
# Load the raw 2023 breakdowns extract and inspect DuckDB's inferred column types.
raw = con.read_csv("amkt-4ehs.csv")
raw.schema()
ibis.Schema {
agency string
city string
state string
ntd_id string
organization_type string
reporter_type string
report_year int64
uace_code int64
uza_name string
primary_uza_population int64
agency_voms int64
mode string
mode_name string
type_of_service string
mode_voms int64
major_mechanical_failures int64
major_mechanical_failures_1 string
other_mechanical_failures int64
other_mechanical_failures_1 string
total_mechanical_failures int64
total_mechanical_failures_1 string
vehicle_passenger_car_miles int64
vehicle_passenger_car_miles_1 string
vehicle_passenger_car_revenue int64
vehicle_passenger_car_miles_2 string
train_miles int64
train_miles_questionable string
train_revenue_miles int64
train_revenue_miles_1 string
}
Remember: Each row is an agency-mode-service type. For example:
- Agency: NJ Transit
  - Modes:
    - Commuter Bus (MB)
      - Service types:
        - Directly Operated (DO)
        - Purchased Transit (PT)
    - Commuter Rail (CR)
      - Service types:
        - DO
    - Light Rail (LR)
      - Service types:
        - DO: River Line, Newark LR (data combined)
        - PT: Hudson-Bergen LR
- Agency: MTA Metro-North
- …
# So these are the primary keys.
keys = [
    "ntd_id",
    "mode",
    "type_of_service",
]
# Staging table: filter to the four rail/bus modes of interest, drop rows with
# questionable or zero figures, then compute each agency-mode-service's breakdown
# rate (revenue miles per mechanical failure) and its within-mode rank.
stg = (
    raw.filter(
        _.mode.isin(["CR", "HR", "LR", "MB"]),
        _.total_mechanical_failures > 0,
        # NOTE: Ibis doesn't return nulls for != predicates, like dplyr or SQL do.
        # The `_1` / `_2` columns are data-quality flags; "Q" marks questionable data.
        (
            (_.total_mechanical_failures_1.isnull())
            | (_.total_mechanical_failures_1 != "Q")
        ),
        _.vehicle_passenger_car_miles > 0,
        (
            (_.vehicle_passenger_car_miles_2.isnull())
            | (_.vehicle_passenger_car_miles_2 != "Q")
        ),
    )
    .group_by(_.mode)
    .mutate(
        # Agencies that report under a "doing business as" name have it after "dba: ".
        agency=_.agency.re_replace(".*dba: ", ""),
        # Higher = better: more miles traveled between mechanical failures.
        breakdown_rate=_.vehicle_passenger_car_miles / _.total_mechanical_failures,
    )
    # NOTE: In Ibis can't create a column from another column created within the same
    # `mutate()` call, like dplyr can.
    .mutate(rank=ibis.rank().over(group_by=_.mode, order_by=_.breakdown_rate.desc()))
    # NOTE: Ranks start at 0!
    .mutate(rank=_.rank + 1)
    .order_by(_.mode, _.rank)
    .select(
        _.mode,
        _.rank,
        _.agency,
        _.ntd_id,
        _.city,
        _.state,
        _.type_of_service,
        _.breakdown_rate,
        _.vehicle_passenger_car_miles,
        _.total_mechanical_failures,
    )
    # NOTE: Data Wrangler can't view Ibis Tables.
    # .to_pandas()
)
# Re-rank only the biggest agencies per mode (by revenue miles), so small operators
# don't dominate the comparison.
biggest = (
    stg.mutate(
        row_num=ibis.row_number().over(
            group_by=_.mode, order_by=_.vehicle_passenger_car_miles.desc()
        )
    )
    # NOTE(review): `row_number()` also starts at 0, so `<= 10` keeps 11 rows per
    # mode — confirm whether the top 10 or top 11 was intended.
    .filter(_.row_num <= 10)
    .mutate(
        rank_biggest=ibis.rank().over(group_by=_.mode, order_by=_.breakdown_rate.desc())
    )
    # Ranks start at 0, as above.
    .mutate(rank_biggest=_.rank_biggest + 1)
    .order_by(_.mode, _.rank_biggest)
    .select(s.all_of(keys), _.rank_biggest)
    # .to_pandas()
)
# Last year's (2022) rankings, keyed the same way, for year-over-year comparison.
last_year = (
    con.read_csv("../breakdowns_2022.csv")
    .rename("snake_case")
    .select(
        s.all_of(keys),
        rank_2022=_.rank,
        # snake_case rename of the 2022 file's "Rank, 10 Biggest" column.
        rank_2022_10_biggest="rank,_10_biggest",
    )
    # .to_pandas()
)
# Final table: staging rows plus the biggest-agencies rank and last year's ranks.
clean = (
    stg.left_join(biggest, keys)
    # NOTE: Ibis results contain "right" join keys, unlike dplyr and SQL.
    .drop(s.endswith("_right"))
    .relocate(_.rank_biggest, after=_.rank)
    .left_join(last_year, keys)
    .drop(s.endswith("_right"))
    .order_by(_.mode, _.rank)
    # .to_pandas()
)
Checks
# No duplicates.
# Sanity check: the join keys uniquely identify each row of `clean`.
# NOTE: Ibis IntegerScalars can't be compared with `==`. You must convert them to NumPy
# objects first with `.to_pandas()`.
assert clean.distinct(on=keys).count().to_pandas() == clean.count().to_pandas()
# NOTE: I miss R/RStudio's `View()`.
= (
check filter(_.rank_biggest.notnull())
clean.
.order_by(_.breakdown_rate.desc())
.to_pandas() )
Output
"../breakdowns_2023.csv") clean.to_csv(