Cleaning National Transit Data with Ibis

Or, Learn Ibis in Several Minutes

Python
Transit
Published

November 3, 2024

Every holiday season, the Federal Transit Administration gifts us a dataset: How much each transit agency’s vehicles broke down during the prior calendar year. It’s 2024. We now have 2023 data. Time to prove once again that New Jersey Transit isn’t the worst in the nation, and therefore everything is fine and you should all stop whining or voting.

But this year, I discovered Ibis, yet another Python library for working with data. If you paid attention, you know I’ve only used R–but exclusively its Tidyverse “dialect”–on this blog to date. I’ve been meaning to write about why I avoid Pandas (a.k.a. Base R for Python), or any other data analysis toolset on the Wall of Shame (base R, PySpark). But unlike those, Ibis is The Way. At long last I’m convinced to migrate away from my beloved Tidyverse + RStudio stack.

In this post I’ll demonstrate how to clean breakdown data from the FTA’s National Transit Database (NTD) using Ibis. I also recommend Visual Studio Code, its Data Wrangler extension, and Jupyter (only for interactive work), which together provide an RStudio-like experience.

While this stack has its hiccups (noted below)—Tidyverse + RStudio still provide the simplest, most ergonomic toolset for exploratory data analysis—Ibis is quite nice and ready for prime time. I bet it’ll one day overtake the others.

Stay tuned for my 2023 Transit Breakdown Rankings in a later post. Now on with the code.

import ibis
# Unlike R, Ibis doesn't have non-standard evaluation. Instaed we use this, which is
# probably better; all these years & I still don't understand closures in R.
from ibis import _
# This provides a suite of column selection helpers like tidyselect.
import ibis.selectors as s

# NOTE: By default, Ibis Tables print just the schema. This makes them print a preview
# of the data, like Data Frames in R or Pandas do.
ibis.options.interactive = True

# NOTE: The Polars backend can't do window functions. You'll get an error when calling
# `rank()` or `row_number()`.
con = ibis.duckdb.connect()
# Source: https://www.transit.dot.gov/ntd/data-product/2023-breakdowns
# Direct download: https://data.transportation.gov/resource/amkt-4ehs.csv?$query=SELECT%0A%20%20%60agency%60%2C%0A%20%20%60city%60%2C%0A%20%20%60state%60%2C%0A%20%20%60ntd_id%60%2C%0A%20%20%60organization_type%60%2C%0A%20%20%60reporter_type%60%2C%0A%20%20%60report_year%60%2C%0A%20%20%60uace_code%60%2C%0A%20%20%60uza_name%60%2C%0A%20%20%60primary_uza_population%60%2C%0A%20%20%60agency_voms%60%2C%0A%20%20%60mode%60%2C%0A%20%20%60mode_name%60%2C%0A%20%20%60type_of_service%60%2C%0A%20%20%60mode_voms%60%2C%0A%20%20%60major_mechanical_failures%60%2C%0A%20%20%60major_mechanical_failures_1%60%2C%0A%20%20%60other_mechanical_failures%60%2C%0A%20%20%60other_mechanical_failures_1%60%2C%0A%20%20%60total_mechanical_failures%60%2C%0A%20%20%60total_mechanical_failures_1%60%2C%0A%20%20%60vehicle_passenger_car_miles%60%2C%0A%20%20%60vehicle_passenger_car_miles_1%60%2C%0A%20%20%60vehicle_passenger_car_revenue%60%2C%0A%20%20%60vehicle_passenger_car_miles_2%60%2C%0A%20%20%60train_miles%60%2C%0A%20%20%60train_miles_questionable%60%2C%0A%20%20%60train_revenue_miles%60%2C%0A%20%20%60train_revenue_miles_1%60%0AWHERE%20caseless_one_of(%60report_year%60%2C%20%222023%22)
# The NTD for 2022 & onward are on the Federal gov't's decent open data website with a
# REST API that's actually nice to work with. However, the community-made Python API
# client, sodapy, is no longer maintained: https://pypi.org/project/sodapy/
# It still works, but it returns all fields as strings.
raw = con.read_csv("amkt-4ehs.csv")
raw.schema()
ibis.Schema {
  agency                         string
  city                           string
  state                          string
  ntd_id                         string
  organization_type              string
  reporter_type                  string
  report_year                    int64
  uace_code                      int64
  uza_name                       string
  primary_uza_population         int64
  agency_voms                    int64
  mode                           string
  mode_name                      string
  type_of_service                string
  mode_voms                      int64
  major_mechanical_failures      int64
  major_mechanical_failures_1    string
  other_mechanical_failures      int64
  other_mechanical_failures_1    string
  total_mechanical_failures      int64
  total_mechanical_failures_1    string
  vehicle_passenger_car_miles    int64
  vehicle_passenger_car_miles_1  string
  vehicle_passenger_car_revenue  int64
  vehicle_passenger_car_miles_2  string
  train_miles                    int64
  train_miles_questionable       string
  train_revenue_miles            int64
  train_revenue_miles_1          string
}

Remember: Each row is an agency-mode-service type. For example:

# So these are the primary keys.
keys = [
    "ntd_id",
    "mode",
    "type_of_service",
]
stg = (
    raw.filter(
        (
            _.mode.isin(["CR", "HR", "LR", "MB"]),
            _.total_mechanical_failures > 0,
            # NOTE: Ibis doesn't return nulls for != predicates, like dplyr or SQL do.
            (
                (_.total_mechanical_failures_1.isnull())
                | (_.total_mechanical_failures_1 != "Q")
            ),
            _.vehicle_passenger_car_miles > 0,
            (
                (_.vehicle_passenger_car_miles_2.isnull())
                | (_.vehicle_passenger_car_miles_2 != "Q")
            ),
        )
    )
    .group_by(_.mode)
    .mutate(
        agency = _.agency.re_replace(".*dba: ", ""),
        breakdown_rate=_.vehicle_passenger_car_miles / _.total_mechanical_failures
    )
    # NOTE: In Ibis can't create a column from another column created within the same
    # `mutate()` call, like dplyr can.
    .mutate(rank=ibis.rank().over(group_by=_.mode, order_by=_.breakdown_rate.desc()))
    # NOTE: Ranks start at 0!
    .mutate(rank=_.rank + 1)
    .order_by(_.mode, _.rank)
    .select(
        _.mode,
        _.rank,
        _.agency,
        _.ntd_id,
        _.city,
        _.state,
        _.type_of_service,
        _.breakdown_rate,
        _.vehicle_passenger_car_miles,
        _.total_mechanical_failures,
    )
    # NOTE: Data Wrangler can't view Ibis Tables.
    # .to_pandas()
)
biggest = (
    stg.mutate(
        row_num=ibis.row_number().over(
            group_by=_.mode, order_by=_.vehicle_passenger_car_miles.desc()
        )
    )
    .filter(_.row_num <= 10)
    .mutate(
        rank_biggest=ibis.rank().over(group_by=_.mode, order_by=_.breakdown_rate.desc())
    )
    .mutate(rank_biggest=_.rank_biggest + 1)
    .order_by(_.mode, _.rank_biggest)
    .select(s.all_of(keys), _.rank_biggest)
    # .to_pandas()
)
last_year = (
    con.read_csv("../breakdowns_2022.csv")
    .rename("snake_case")
    .select(
        s.all_of(keys),
        rank_2022=_.rank,
        rank_2022_10_biggest="rank,_10_biggest",
    )
    # .to_pandas()
)
clean = (
    stg.left_join(biggest, keys)
    # NOTE: Ibis results contain "right" join keys, unlike dplyr and SQL.
    .drop(s.endswith("_right"))
    .relocate(_.rank_biggest, after=_.rank)
    .left_join(last_year, keys)
    .drop(s.endswith("_right"))
    .order_by(_.mode, _.rank)
    # .to_pandas()
)

Checks

# No duplicates.
# NOTE: Ibis IntegerScalars can't be compared with `==`. You must convert them to NumPy
# objects first with `.to_pandas()`.
assert clean.distinct(on=keys).count().to_pandas() == clean.count().to_pandas()
# NOTE: I miss R/RStudio's `View()`.
check = (
    clean.filter(_.rank_biggest.notnull())
    .order_by(_.breakdown_rate.desc())
    .to_pandas()
)

Output

clean.to_csv("../breakdowns_2023.csv")