API Reference¶

The public API is organized around explicit namespaces rather than a flat root package.

The root nyc311 package is intentionally minimal and only exposes version metadata. Import functionality from the canonical public modules below.

nyc311.geographies is the one namespace that intentionally fronts another package: it preserves the 311-facing geography surface while delegating generic boundary loading and normalization primitives to nyc-geo-toolkit.

Update docstrings and exported symbols in src/nyc311/ rather than editing this reference structure by hand.

Root Package¶

nyc311 ¶

Minimal root namespace for the nyc311 package.

Models¶

nyc311.models ¶

Public typed models and constants for the nyc311 package.

BoundaryCollection `module-attribute` ¶

BoundaryCollection = BoundaryCollection

BoundaryFeature `module-attribute` ¶

BoundaryFeature = BoundaryFeature

BOROUGH_BRONX `module-attribute` ¶

BOROUGH_BRONX: Final[BoroughName] = 'BRONX'

BOROUGH_BROOKLYN `module-attribute` ¶

BOROUGH_BROOKLYN: Final[BoroughName] = 'BROOKLYN'

BOROUGH_MANHATTAN `module-attribute` ¶

BOROUGH_MANHATTAN: Final[BoroughName] = 'MANHATTAN'

BOROUGH_QUEENS `module-attribute` ¶

BOROUGH_QUEENS: Final[BoroughName] = 'QUEENS'

BOROUGH_STATEN_ISLAND `module-attribute` ¶

BOROUGH_STATEN_ISLAND: Final[BoroughName] = 'STATEN ISLAND'

SOCRATA_DATASET_IDENTIFIER `module-attribute` ¶

SOCRATA_DATASET_IDENTIFIER: Final[str] = 'erm2-nwe9'

SUPPORTED_BOROUGHS `module-attribute` ¶

SUPPORTED_BOROUGHS: Final[tuple[BoroughName, ...]] = (
    BOROUGH_BRONX,
    BOROUGH_BROOKLYN,
    BOROUGH_MANHATTAN,
    BOROUGH_QUEENS,
    BOROUGH_STATEN_ISLAND,
)

SUPPORTED_BOUNDARY_GEOGRAPHIES `module-attribute` ¶

SUPPORTED_BOUNDARY_GEOGRAPHIES: Final[tuple[str, ...]] = (
    "borough",
    "community_district",
    "council_district",
    "neighborhood_tabulation_area",
    "census_tract",
    "zcta",
)

SUPPORTED_GEOGRAPHIES `module-attribute` ¶

SUPPORTED_GEOGRAPHIES: Final[tuple[str, ...]] = (
    SUPPORTED_RECORD_GEOGRAPHIES
)

SUPPORTED_RECORD_GEOGRAPHIES `module-attribute` ¶

SUPPORTED_RECORD_GEOGRAPHIES: Final[tuple[str, ...]] = (
    "borough",
    "community_district",
)

BoroughName `module-attribute` ¶

BoroughName = str

AnalysisWindow `dataclass` ¶

Rolling time window used for trend and anomaly calculations.

Source code in src/nyc311/models/_analysis.py

@dataclass(frozen=True, slots=True)
class AnalysisWindow:
    """Rolling time window used for trend and anomaly calculations."""

    days: int

    def __post_init__(self) -> None:
        if self.days < 1:
            raise ValueError("days must be at least 1.")

days `instance-attribute` ¶

days: int

AnomalyResult `dataclass` ¶

A standardized anomaly score for one aggregated topic summary.

Source code in src/nyc311/models/_analysis.py

@dataclass(frozen=True, slots=True)
class AnomalyResult:
    """Standardized anomaly score attached to one aggregated topic summary.

    On construction the instance validates itself and then stores the
    lower-cased ``geography`` key plus the ``_normalize_value`` form of
    ``geography_value``, ``complaint_type`` and ``topic``.

    Raises:
        ValueError: If ``geography`` is not in ``SUPPORTED_GEOGRAPHIES``, a
            text field normalizes to an empty value, a count or rank is
            below 1, ``geography_total_count`` is smaller than
            ``complaint_count``, ``share_of_geography`` is outside
            ``(0, 1]``, or ``anomaly_threshold`` is not positive.
    """

    geography: str
    geography_value: str
    complaint_type: str
    topic: str
    complaint_count: int
    geography_total_count: int
    share_of_geography: float
    topic_rank: int
    z_score: float
    is_anomaly: bool
    window_days: int
    anomaly_threshold: float

    def __post_init__(self) -> None:
        geography_key = self.geography.strip().lower()
        if geography_key not in SUPPORTED_GEOGRAPHIES:
            raise ValueError(
                "Unsupported anomaly geography. "
                f"Expected one of {SUPPORTED_GEOGRAPHIES}, got {self.geography!r}."
            )

        # Text fields are checked first, in declaration order.
        cleaned: dict[str, str] = {}
        for name in ("geography_value", "complaint_type", "topic"):
            text = _normalize_value(getattr(self, name))
            if not text:
                raise ValueError(f"{name} must not be empty.")
            cleaned[name] = text

        if self.complaint_count < 1:
            raise ValueError("complaint_count must be at least 1.")
        if self.geography_total_count < self.complaint_count:
            raise ValueError("geography_total_count must be >= complaint_count.")
        share = self.share_of_geography
        if not (0 < share <= 1):
            raise ValueError("share_of_geography must be in the interval (0, 1].")
        if self.topic_rank < 1:
            raise ValueError("topic_rank must be at least 1.")
        if self.window_days < 1:
            raise ValueError("window_days must be at least 1.")
        if self.anomaly_threshold <= 0:
            raise ValueError("anomaly_threshold must be positive.")

        # The dataclass is frozen, so normalized values are written through
        # object.__setattr__.
        store = object.__setattr__
        store(self, "geography", geography_key)
        for name, text in cleaned.items():
            store(self, name, text)

geography `instance-attribute` ¶

geography: str

geography_value `instance-attribute` ¶

geography_value: str

complaint_type `instance-attribute` ¶

complaint_type: str

topic `instance-attribute` ¶

topic: str

complaint_count `instance-attribute` ¶

complaint_count: int

geography_total_count `instance-attribute` ¶

geography_total_count: int

share_of_geography `instance-attribute` ¶

share_of_geography: float

topic_rank `instance-attribute` ¶

topic_rank: int

z_score `instance-attribute` ¶

z_score: float

is_anomaly `instance-attribute` ¶

is_anomaly: bool

window_days `instance-attribute` ¶

window_days: int

anomaly_threshold `instance-attribute` ¶

anomaly_threshold: float

ExportTarget `dataclass` ¶

Destination metadata for supported exporters.

Source code in src/nyc311/models/_analysis.py

@dataclass(frozen=True, slots=True)
class ExportTarget:
    """Destination metadata for supported exporters."""

    format: str
    output_path: Path

    def __post_init__(self) -> None:
        normalized_format = self.format.strip().lower()
        if not normalized_format:
            raise ValueError("format must not be empty.")
        object.__setattr__(self, "format", normalized_format)
        object.__setattr__(self, "output_path", Path(self.output_path))

format `instance-attribute` ¶

format: str

output_path `instance-attribute` ¶

output_path: Path

GeographyTopicSummary `dataclass` ¶

An export-ready summary row for topic counts within one geography.

Source code in src/nyc311/models/_analysis.py

@dataclass(frozen=True, slots=True)
class GeographyTopicSummary:
    """One export-ready row describing a topic's volume inside a geography.

    On construction the instance validates itself and then stores the
    lower-cased ``geography`` key plus the ``_normalize_value`` form of
    ``geography_value``, ``complaint_type`` and ``topic``.

    Raises:
        ValueError: If ``geography`` is not in ``SUPPORTED_GEOGRAPHIES``, a
            count or rank is below 1, ``geography_total_count`` is smaller
            than ``complaint_count``, ``share_of_geography`` is outside
            ``(0, 1]``, or a text field normalizes to an empty value.
    """

    geography: str
    geography_value: str
    complaint_type: str
    topic: str
    complaint_count: int
    geography_total_count: int
    share_of_geography: float
    topic_rank: int
    is_dominant_topic: bool

    def __post_init__(self) -> None:
        geography_key = self.geography.strip().lower()
        if geography_key not in SUPPORTED_GEOGRAPHIES:
            raise ValueError(
                "Unsupported geography summary. "
                f"Expected one of {SUPPORTED_GEOGRAPHIES}, got {self.geography!r}."
            )

        # Numeric checks run before the text fields are inspected.
        if self.complaint_count < 1:
            raise ValueError("complaint_count must be at least 1.")
        if self.geography_total_count < self.complaint_count:
            raise ValueError("geography_total_count must be >= complaint_count.")
        share = self.share_of_geography
        if not (0 < share <= 1):
            raise ValueError("share_of_geography must be in the interval (0, 1].")
        if self.topic_rank < 1:
            raise ValueError("topic_rank must be at least 1.")

        cleaned: dict[str, str] = {}
        for name in ("geography_value", "complaint_type", "topic"):
            text = _normalize_value(getattr(self, name))
            if not text:
                raise ValueError(f"{name} must not be empty.")
            cleaned[name] = text

        # Frozen dataclass: write the normalized values via object.__setattr__.
        store = object.__setattr__
        store(self, "geography", geography_key)
        for name, text in cleaned.items():
            store(self, name, text)

geography `instance-attribute` ¶

geography: str

geography_value `instance-attribute` ¶

geography_value: str

complaint_type `instance-attribute` ¶

complaint_type: str

topic `instance-attribute` ¶

topic: str

complaint_count `instance-attribute` ¶

complaint_count: int

geography_total_count `instance-attribute` ¶

geography_total_count: int

share_of_geography `instance-attribute` ¶

share_of_geography: float

topic_rank `instance-attribute` ¶

topic_rank: int

is_dominant_topic `instance-attribute` ¶

is_dominant_topic: bool

ResolutionGapSummary `dataclass` ¶

A first-pass borough-level summary of unresolved complaint volume.

Source code in src/nyc311/models/_analysis.py

@dataclass(frozen=True, slots=True)
class ResolutionGapSummary:
    """First-pass summary of unresolved complaint volume for one geography.

    On construction the instance validates itself and then stores the
    lower-cased ``geography`` key plus the ``_normalize_value`` form of
    ``geography_value`` and ``complaint_type``.

    Raises:
        ValueError: If ``geography`` is not in ``SUPPORTED_GEOGRAPHIES``,
            ``total_request_count`` is below 1, a resolution count is
            negative, the resolved and unresolved counts do not add up to
            the total, a share/rate is outside ``[0, 1]``, or a text field
            normalizes to an empty value.
    """

    geography: str
    geography_value: str
    complaint_type: str
    total_request_count: int
    resolved_request_count: int
    unresolved_request_count: int
    unresolved_share: float
    resolution_rate: float

    def __post_init__(self) -> None:
        geography_key = self.geography.strip().lower()
        if geography_key not in SUPPORTED_GEOGRAPHIES:
            raise ValueError(
                "Unsupported geography summary. "
                f"Expected one of {SUPPORTED_GEOGRAPHIES}, got {self.geography!r}."
            )

        if self.total_request_count < 1:
            raise ValueError("total_request_count must be at least 1.")
        resolved = self.resolved_request_count
        unresolved = self.unresolved_request_count
        if resolved < 0 or unresolved < 0:
            raise ValueError("resolution counts must be non-negative.")
        # Every request is either resolved or unresolved, so the two counts
        # must account for the whole total.
        if resolved + unresolved != self.total_request_count:
            raise ValueError(
                "resolved_request_count + unresolved_request_count must equal total_request_count."
            )
        if not (0 <= self.unresolved_share <= 1):
            raise ValueError("unresolved_share must be in the interval [0, 1].")
        if not (0 <= self.resolution_rate <= 1):
            raise ValueError("resolution_rate must be in the interval [0, 1].")

        cleaned: dict[str, str] = {}
        for name in ("geography_value", "complaint_type"):
            text = _normalize_value(getattr(self, name))
            if not text:
                raise ValueError(f"{name} must not be empty.")
            cleaned[name] = text

        # Frozen dataclass: write the normalized values via object.__setattr__.
        store = object.__setattr__
        store(self, "geography", geography_key)
        for name, text in cleaned.items():
            store(self, name, text)

geography `instance-attribute` ¶

geography: str

geography_value `instance-attribute` ¶

geography_value: str

complaint_type `instance-attribute` ¶

complaint_type: str

total_request_count `instance-attribute` ¶

total_request_count: int

resolved_request_count `instance-attribute` ¶

resolved_request_count: int

unresolved_request_count `instance-attribute` ¶

unresolved_request_count: int

unresolved_share `instance-attribute` ¶

unresolved_share: float

resolution_rate `instance-attribute` ¶

resolution_rate: float

TopicCoverageReport `dataclass` ¶

Coverage metadata that shows how much a topic ruleset matched.

Source code in src/nyc311/models/_analysis.py

@dataclass(frozen=True, slots=True)
class TopicCoverageReport:
    """Coverage metadata showing how much of the data a topic ruleset matched.

    ``complaint_type`` is stored in its ``_normalize_value`` form.

    Raises:
        ValueError: If ``complaint_type`` normalizes to an empty value, any
            record count is negative, ``matched_records + other_records``
            differs from ``total_records``, or ``coverage_rate`` is outside
            ``[0, 1]``.
    """

    complaint_type: str
    total_records: int
    matched_records: int
    other_records: int
    coverage_rate: float
    top_unmatched_descriptors: tuple[tuple[str, int], ...]

    def __post_init__(self) -> None:
        cleaned_complaint_type = _normalize_value(self.complaint_type)
        if not cleaned_complaint_type:
            raise ValueError("complaint_type must not be empty.")

        for name in ("total_records", "matched_records", "other_records"):
            if getattr(self, name) < 0:
                raise ValueError(f"{name} must be non-negative.")

        # Matched and unmatched ("other") records must partition the total.
        accounted_for = self.matched_records + self.other_records
        if accounted_for != self.total_records:
            raise ValueError(
                "matched_records + other_records must equal total_records."
            )
        if not (0 <= self.coverage_rate <= 1):
            raise ValueError("coverage_rate must be in the interval [0, 1].")

        # Frozen dataclass: write the normalized value via object.__setattr__.
        object.__setattr__(self, "complaint_type", cleaned_complaint_type)

complaint_type `instance-attribute` ¶

complaint_type: str

total_records `instance-attribute` ¶

total_records: int

matched_records `instance-attribute` ¶

matched_records: int

other_records `instance-attribute` ¶

other_records: int

coverage_rate `instance-attribute` ¶

coverage_rate: float

top_unmatched_descriptors `instance-attribute` ¶

top_unmatched_descriptors: tuple[tuple[str, int], ...]

TopicQuery `dataclass` ¶

Topic-analysis parameters for the implemented rules-based workflow.

Source code in src/nyc311/models/_analysis.py

@dataclass(frozen=True, slots=True)
class TopicQuery:
    """Parameters for the implemented rules-based topic analysis.

    ``complaint_type`` is stored in its ``_normalize_value`` form.

    Raises:
        ValueError: If ``complaint_type`` normalizes to an empty value or
            ``top_n`` is smaller than 1.
    """

    complaint_type: str
    top_n: int = 20

    def __post_init__(self) -> None:
        cleaned = _normalize_value(self.complaint_type)
        if not cleaned:
            raise ValueError("complaint_type must not be empty.")

        limit_too_small = self.top_n < 1
        if limit_too_small:
            raise ValueError("top_n must be at least 1.")

        # Frozen dataclass: write the normalized value via object.__setattr__.
        object.__setattr__(self, "complaint_type", cleaned)

complaint_type `instance-attribute` ¶

complaint_type: str

top_n `class-attribute` `instance-attribute` ¶

top_n: int = 20

BoundaryGeoJSONExport `dataclass` ¶

Combined boundary + summary payload for GeoJSON export.

Source code in src/nyc311/models/_boundaries.py

@dataclass(frozen=True, slots=True)
class BoundaryGeoJSONExport:
    """Combined boundary + summary payload for GeoJSON export."""

    boundaries: BoundaryCollection
    summaries: tuple[GeographyTopicSummary, ...]

boundaries `instance-attribute` ¶

boundaries: BoundaryCollection

summaries `instance-attribute` ¶

summaries: tuple[GeographyTopicSummary, ...]

GeographyFilter `dataclass` ¶

A supported geography selector for implemented loading filters.

Source code in src/nyc311/models/_filters.py

@dataclass(frozen=True, slots=True)
class GeographyFilter:
    """Selector naming one supported geography level and a value within it.

    ``geography`` is stored trimmed and lower-cased. ``value`` is passed
    through ``normalize_borough_name`` for borough filters and through
    ``_normalize_value`` for every other geography.

    Raises:
        ValueError: If ``geography`` is not in ``SUPPORTED_GEOGRAPHIES``, the
            value normalizes to an empty value, or (for borough filters)
            ``normalize_borough_name`` rejects the value.
    """

    geography: str
    value: str

    def __post_init__(self) -> None:
        geography_key = self.geography.strip().lower()

        # The value is normalized before the geography key is validated, so a
        # bad borough name is reported by normalize_borough_name itself.
        if geography_key == "borough":
            cleaned_value = normalize_borough_name(self.value)
        else:
            cleaned_value = _normalize_value(self.value)

        if geography_key not in SUPPORTED_GEOGRAPHIES:
            raise ValueError(
                "Unsupported geography filter. "
                f"Expected one of {SUPPORTED_GEOGRAPHIES}, got {self.geography!r}."
            )
        if not cleaned_value:
            raise ValueError("Geography filter value must not be empty.")

        # Frozen dataclass: write the normalized values via object.__setattr__.
        store = object.__setattr__
        store(self, "geography", geography_key)
        store(self, "value", cleaned_value)

geography `instance-attribute` ¶

geography: str

value `instance-attribute` ¶

value: str

ServiceRequestFilter `dataclass` ¶

Filters for CSV and Socrata service-request loading.

Source code in src/nyc311/models/_filters.py

@dataclass(frozen=True, slots=True)
class ServiceRequestFilter:
    """Filters for CSV and Socrata service-request loading."""

    start_date: date | None = None
    end_date: date | None = None
    geography: GeographyFilter | None = None
    complaint_types: tuple[str, ...] = ()

    def __post_init__(self) -> None:
        if self.start_date and self.end_date and self.start_date > self.end_date:
            raise ValueError("start_date must be on or before end_date.")

        normalized_complaint_types = tuple(
            normalized
            for raw_value in self.complaint_types
            if (normalized := _normalize_value(raw_value))
        )
        object.__setattr__(self, "complaint_types", normalized_complaint_types)

start_date `class-attribute` `instance-attribute` ¶

start_date: date | None = None

end_date `class-attribute` `instance-attribute` ¶

end_date: date | None = None

geography `class-attribute` `instance-attribute` ¶

geography: GeographyFilter | None = None

complaint_types `class-attribute` `instance-attribute` ¶

complaint_types: tuple[str, ...] = ()

SocrataConfig `dataclass` ¶

Configuration for the implemented live Socrata loader path.

extra_where_clauses holds additional $where fragments (Socrata SoQL) that are AND-joined after the predicates derived from ServiceRequestFilter. Use it for predicates that the filter does not cover (e.g. latitude IS NOT NULL). Values are stripped; empty strings are dropped.

Source code in src/nyc311/models/_filters.py

@dataclass(frozen=True, slots=True)
class SocrataConfig:
    """Configuration for the implemented live Socrata loader path.

    ``extra_where_clauses`` holds additional ``$where`` fragments (Socrata SoQL) that
    are AND-joined after the predicates derived from :class:`ServiceRequestFilter`.
    Use for predicates not covered by the filter (e.g. ``latitude IS NOT NULL``).
    Values are stripped; empty strings are dropped.

    ``dataset_identifier`` and ``base_url`` are stored with surrounding
    whitespace removed; ``base_url`` additionally loses any trailing ``/``.

    Raises:
        ValueError: If ``dataset_identifier`` or ``base_url`` is empty after
            normalization, ``page_size`` is below 1,
            ``request_timeout_seconds`` is not positive, ``max_pages`` is
            given but below 1, or ``created_date_sort`` is neither ``"asc"``
            nor ``"desc"``.
    """

    dataset_identifier: str = SOCRATA_DATASET_IDENTIFIER
    base_url: str = "https://data.cityofnewyork.us/resource"
    app_token: str | None = None
    page_size: int = 1000
    request_timeout_seconds: float = 30.0
    max_pages: int | None = None
    created_date_sort: Literal["asc", "desc"] = "asc"
    extra_where_clauses: tuple[str, ...] = field(default_factory=tuple)

    def __post_init__(self) -> None:
        dataset_identifier = self.dataset_identifier.strip()
        # Strip whitespace first: otherwise a whitespace-only URL would pass
        # the emptiness check below, and a trailing space would shield the
        # trailing "/" from being removed.
        base_url = self.base_url.strip().rstrip("/")

        if not dataset_identifier:
            raise ValueError("dataset_identifier must not be empty.")
        if not base_url:
            raise ValueError("base_url must not be empty.")
        if self.page_size < 1:
            raise ValueError("page_size must be at least 1.")
        if self.request_timeout_seconds <= 0:
            raise ValueError("request_timeout_seconds must be positive.")
        if self.max_pages is not None and self.max_pages < 1:
            raise ValueError("max_pages must be at least 1 when provided.")
        # Literal[...] is not enforced at runtime, so check the value here.
        if self.created_date_sort not in ("asc", "desc"):
            raise ValueError("created_date_sort must be 'asc' or 'desc'.")

        normalized_extra_where_clauses = tuple(
            normalized
            for raw_value in self.extra_where_clauses
            if (normalized := _normalize_value(raw_value))
        )
        # Frozen dataclass: write the normalized values via object.__setattr__.
        object.__setattr__(self, "dataset_identifier", dataset_identifier)
        object.__setattr__(self, "base_url", base_url)
        object.__setattr__(self, "extra_where_clauses", normalized_extra_where_clauses)

dataset_identifier `class-attribute` `instance-attribute` ¶

dataset_identifier: str = SOCRATA_DATASET_IDENTIFIER

base_url `class-attribute` `instance-attribute` ¶

base_url: str = 'https://data.cityofnewyork.us/resource'

app_token `class-attribute` `instance-attribute` ¶

app_token: str | None = None

page_size `class-attribute` `instance-attribute` ¶

page_size: int = 1000

request_timeout_seconds `class-attribute` `instance-attribute` ¶

request_timeout_seconds: float = 30.0

max_pages `class-attribute` `instance-attribute` ¶

max_pages: int | None = None

created_date_sort `class-attribute` `instance-attribute` ¶

created_date_sort: Literal['asc', 'desc'] = 'asc'

extra_where_clauses `class-attribute` `instance-attribute` ¶

extra_where_clauses: tuple[str, ...] = field(
    default_factory=tuple
)

ServiceRequestRecord `dataclass` ¶

A single loaded NYC 311-style service-request record.

Note: As of nyc311 v1.0.1, closed_date is carried alongside created_date so resolution-time analyses don't have to bypass the SDK. The field is optional — Socrata returns a null closed_date for any unresolved complaint — and existing call sites that instantiate the record without it keep working unchanged.

Note: As of nyc311 v1.0.4, created_at / closed_at carry the full-precision timestamps whenever the source string includes a time component (Socrata always provides one for this dataset). The day-grain created_date / closed_date fields stay authoritative for day-grain analyses and remain backward-compatible; use closed_at - created_at when a resolution-time analysis needs hour precision.

Source code in src/nyc311/models/_records.py

@dataclass(frozen=True, slots=True)
class ServiceRequestRecord:
    """One loaded NYC 311-style service request.

    Construction validates the required text fields and stores normalized
    values for the identifier, complaint type, descriptor, borough,
    community district, resolution description and coordinate pair.

    .. note::

        Since nyc311 v1.0.1 the record carries ``closed_date`` next to
        ``created_date`` so resolution-time analyses don't have to bypass
        the SDK. The field is optional — Socrata returns a null
        ``closed_date`` for any unresolved complaint — and existing call
        sites that build a record without it keep working unchanged.

    .. note::

        Since nyc311 v1.0.4 ``created_at`` / ``closed_at`` hold the
        full-precision timestamps whenever the source string includes a
        time component (Socrata always provides one for this dataset).
        The day-grain ``created_date`` / ``closed_date`` fields stay
        authoritative for day-grain analyses and remain
        backward-compatible; use ``closed_at - created_at`` when a
        resolution-time analysis needs hour precision.

    Raises:
        ValueError: If ``service_request_id``, ``complaint_type``,
            ``borough`` or ``community_district`` normalizes to an empty
            value.
    """

    service_request_id: str
    created_date: date
    complaint_type: str
    descriptor: str
    borough: str
    community_district: str
    resolution_description: str | None = None
    latitude: float | None = None
    longitude: float | None = None
    #: Date the complaint was closed. ``None`` for unresolved
    #: complaints. Use ``closed_date - created_date`` for resolution
    #: latency in days.
    closed_date: date | None = None
    #: Full-precision creation timestamp when the source provides a
    #: time component; ``None`` when only a bare date was available.
    #: ``created_date`` stays authoritative for day-grain analyses.
    created_at: datetime | None = None
    #: Full-precision closure timestamp when the source provides a
    #: time component; ``None`` for unresolved complaints or when only
    #: a bare date was available. Use ``closed_at - created_at`` for
    #: resolution latency at hour precision.
    closed_at: datetime | None = None

    def __post_init__(self) -> None:
        # Required text fields, checked in declaration order.
        for name in (
            "service_request_id",
            "complaint_type",
            "borough",
            "community_district",
        ):
            if not _normalize_value(getattr(self, name)):
                raise ValueError(f"{name} must not be empty.")

        # Frozen dataclass: every write goes through object.__setattr__.
        store = object.__setattr__
        for name in ("service_request_id", "complaint_type", "descriptor"):
            store(self, name, _normalize_value(getattr(self, name)))
        store(self, "borough", _normalize_borough_or_passthrough(self.borough))
        store(
            self,
            "community_district",
            _normalize_community_district_or_passthrough(self.community_district),
        )

        # A resolution description that normalizes to an empty value is
        # stored as None, just like a missing one.
        resolution = self.resolution_description
        if resolution is not None:
            resolution = _normalize_value(resolution) or None
        store(self, "resolution_description", resolution)

        # Latitude and longitude are normalized together as a pair.
        latitude, longitude = _normalize_coordinate_pair(
            self.latitude,
            self.longitude,
        )
        store(self, "latitude", latitude)
        store(self, "longitude", longitude)

    def geography_value(self, geography: str) -> str:
        """Look up this record's value for the requested geography level.

        The key is trimmed and lower-cased before matching; ``"borough"``
        and ``"community_district"`` are recognized.

        Raises:
            ValueError: For any other geography key.
        """
        key = geography.strip().lower()
        if key not in ("borough", "community_district"):
            raise ValueError(
                "Unsupported aggregation geography. "
                f"Expected one of {SUPPORTED_GEOGRAPHIES}, got {geography!r}."
            )
        return self.borough if key == "borough" else self.community_district

service_request_id `instance-attribute` ¶

service_request_id: str

created_date `instance-attribute` ¶

created_date: date

complaint_type `instance-attribute` ¶

complaint_type: str

descriptor `instance-attribute` ¶

descriptor: str

borough `instance-attribute` ¶

borough: str

community_district `instance-attribute` ¶

community_district: str

resolution_description `class-attribute` `instance-attribute` ¶

resolution_description: str | None = None

latitude `class-attribute` `instance-attribute` ¶

latitude: float | None = None

longitude `class-attribute` `instance-attribute` ¶

longitude: float | None = None

closed_date `class-attribute` `instance-attribute` ¶

closed_date: date | None = None

created_at `class-attribute` `instance-attribute` ¶

created_at: datetime | None = None

closed_at `class-attribute` `instance-attribute` ¶

closed_at: datetime | None = None

geography_value ¶

geography_value(geography: str) -> str

Return the value for a supported geography key.

Source code in src/nyc311/models/_records.py

def geography_value(self, geography: str) -> str:
    """Look up this record's value for the requested geography level.

    The key is trimmed and lower-cased before matching; ``"borough"`` and
    ``"community_district"`` are recognized.

    Raises:
        ValueError: For any other geography key.
    """
    key = geography.strip().lower()
    if key not in ("borough", "community_district"):
        raise ValueError(
            "Unsupported aggregation geography. "
            f"Expected one of {SUPPORTED_GEOGRAPHIES}, got {geography!r}."
        )
    return self.borough if key == "borough" else self.community_district

TopicAssignment `dataclass` ¶

A deterministic topic label derived from one service-request record.

Source code in src/nyc311/models/_records.py

@dataclass(frozen=True, slots=True)
class TopicAssignment:
    """Deterministic topic label attached to one service-request record.

    ``topic`` and ``normalized_text`` are stored in their
    ``_normalize_value`` form.

    Raises:
        ValueError: If ``topic`` or ``normalized_text`` normalizes to an
            empty value.
    """

    record: ServiceRequestRecord
    topic: str
    normalized_text: str

    def __post_init__(self) -> None:
        cleaned: dict[str, str] = {}
        for name in ("topic", "normalized_text"):
            text = _normalize_value(getattr(self, name))
            if not text:
                raise ValueError(f"{name} must not be empty.")
            cleaned[name] = text

        # Frozen dataclass: write the normalized values via object.__setattr__.
        for name, text in cleaned.items():
            object.__setattr__(self, name, text)

record `instance-attribute` ¶

record: ServiceRequestRecord

topic `instance-attribute` ¶

topic: str

normalized_text `instance-attribute` ¶

normalized_text: str

supported_topic_queries ¶

supported_topic_queries() -> tuple[str, ...]

Return the complaint types with implemented topic extraction.

Source code in src/nyc311/models/_constants.py

def supported_topic_queries() -> tuple[str, ...]:
    """Return the complaint types with implemented topic extraction.

    Returns:
        The module-level ``_SUPPORTED_TOPIC_QUERIES`` tuple, returned as-is
        (no copy is made).
    """
    return _SUPPORTED_TOPIC_QUERIES

normalize_borough_name ¶

normalize_borough_name(value: str) -> str

Normalize a borough name or borough alias to the canonical public constant.

Source code in src/nyc311/models/_normalize.py

def normalize_borough_name(value: str) -> str:
    """Map a borough name or alias onto its canonical public constant.

    The value is first run through ``_normalize_borough_or_passthrough``;
    the result is accepted only if it is one of ``SUPPORTED_BOROUGHS``.

    Raises:
        ValueError: If the normalized value is not a supported borough.
    """
    candidate = _normalize_borough_or_passthrough(value)
    if candidate in SUPPORTED_BOROUGHS:
        return candidate
    raise ValueError(
        "Unsupported borough name. "
        f"Expected one of {SUPPORTED_BOROUGHS}, got {value!r}."
    )

IO¶

nyc311.io ¶

Public loading helpers for service-request data.

REQUIRED_SERVICE_REQUEST_COLUMNS `module-attribute` ¶

REQUIRED_SERVICE_REQUEST_COLUMNS: Final[tuple[str, ...]] = (
    SERVICE_REQUEST_CSV_COLUMNS
)

cache_path_for_request ¶

cache_path_for_request(
    socrata_config: SocrataConfig,
    filters: ServiceRequestFilter,
    cache_dir: Path,
) -> Path

Return the deterministic CSV path for a Socrata config + filter pair.

Source code in src/nyc311/io/_cache.py

def cache_path_for_request(
    socrata_config: SocrataConfig,
    filters: ServiceRequestFilter,
    cache_dir: Path,
) -> Path:
    """Return the deterministic CSV path for a Socrata config + filter pair.

    The file name encodes the borough (or ``all``), the complaint types, the
    date range, the page size and the sort direction. A non-borough geography
    filter and any ``extra_where_clauses`` also change which rows are
    fetched, so when either is present a short digest of them is appended to
    the name. Requests without them keep exactly the file names produced
    before the digest was introduced, so existing caches stay valid.

    Args:
        socrata_config: Supplies ``page_size``, ``created_date_sort`` and
            ``extra_where_clauses``.
        filters: Supplies the date range, geography and complaint types.
        cache_dir: Directory the returned path is placed in.

    Returns:
        ``cache_dir / <name>.csv``; nothing is created on disk.
    """
    import hashlib  # local import keeps this block self-contained

    start = filters.start_date.isoformat() if filters.start_date else "none"
    end = filters.end_date.isoformat() if filters.end_date else "none"
    page = socrata_config.page_size
    sort_suffix = "_desc" if socrata_config.created_date_sort == "desc" else ""

    # Inputs that alter the result set but have no dedicated slot in the
    # file name. Without them, e.g. two community-district filters would
    # share one cache file and the second request would read stale rows.
    # NOTE(review): dataset_identifier, base_url and max_pages are still not
    # part of the name — confirm whether that is intentional.
    geography = filters.geography
    discriminators: list[str] = []
    if geography is not None and geography.geography != "borough":
        discriminators.append(f"geo:{geography.geography}={geography.value}")
    # Clauses are AND-joined, so their order does not change the result set.
    discriminators.extend(
        f"where:{clause}" for clause in sorted(socrata_config.extra_where_clauses)
    )
    extra_suffix = ""
    if discriminators:
        payload = "\x1f".join(discriminators).encode("utf-8")
        extra_suffix = f"_x{hashlib.sha256(payload).hexdigest()[:12]}"

    if geography is None and not filters.complaint_types:
        name = f"all_{start}_{end}_{page}{sort_suffix}{extra_suffix}.csv"
        return cache_dir / name

    borough = "all"
    if geography is not None and geography.geography == "borough":
        borough = _slug(geography.value)

    complaint_types = filters.complaint_types
    if not complaint_types:
        ct_slug = "all"
    elif len(complaint_types) == 1:
        ct_slug = _slug(complaint_types[0])
    else:
        # Sorted so the same set of complaint types always maps to one name;
        # truncated to keep the file name at a manageable length.
        joined = "_".join(sorted(_slug(c) for c in complaint_types))
        ct_slug = joined[:120]

    name = f"{borough}_{ct_slug}_{start}_{end}_{page}{sort_suffix}{extra_suffix}.csv"
    return cache_dir / name

cached_fetch ¶

cached_fetch(
    socrata_config: SocrataConfig,
    filters: ServiceRequestFilter,
    *,
    cache_dir: Path,
    refresh: bool = False,
    request_open: Callable[..., Any] | None = None,
    max_records: int | None = None,
    on_page: Callable[[int, int], None] | None = None,
) -> Path

Stream a Socrata query to a CSV file under cache_dir; return the path.

Skips the network fetch when the file already exists and refresh is False. Rows are filtered with the same rules as load_service_requests_from_socrata.

For multi-gigabyte extracts, prefer this function and analyze the result with chunked pandas.read_csv instead of loading via load_service_requests, which materializes rows in memory.

The optional on_page callback is forwarded to nyc311.io.iter_service_requests_from_socrata for per-HTTP-page progress (page index and row count for that page).

Source code in src/nyc311/io/_cache.py

def cached_fetch(
    socrata_config: SocrataConfig,
    filters: ServiceRequestFilter,
    *,
    cache_dir: Path,
    refresh: bool = False,
    request_open: Callable[..., Any] | None = None,
    max_records: int | None = None,
    on_page: Callable[[int, int], None] | None = None,
) -> Path:
    """Materialize a Socrata query as a CSV inside ``cache_dir`` and return its path.

    An existing cache file is reused as-is unless ``refresh`` is true. Records
    are streamed from :func:`nyc311.io.iter_service_requests_from_socrata`
    (``on_page`` is handed to it for per-page progress) and only those accepted
    by ``record_matches_service_request_filter`` are written.

    Rows go to a sibling partial file first; it is renamed onto the final path
    only after the stream finishes, and removed if anything is raised along the
    way. Writing stops once ``max_records`` rows have been written.

    Prefer this over :func:`load_service_requests` for very large extracts and
    read the result with chunked ``pandas.read_csv``.
    """
    cache_dir = Path(cache_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)
    output_path = cache_path_for_request(socrata_config, filters, cache_dir)
    partial_path = _partial_cache_path(output_path)

    if not refresh:
        if output_path.is_file():
            # Cache hit: nothing to download.
            return output_path
        if partial_path.is_file():
            # Leftover from an interrupted run; never treat it as complete.
            partial_path.unlink()
    else:
        # Forced refresh: drop both the finished file and any leftover partial.
        for stale_path in (output_path, partial_path):
            if stale_path.is_file():
                stale_path.unlink()

    fetch = request_open if request_open is not None else urlopen
    written = 0
    try:
        with partial_path.open("w", newline="", encoding="utf-8") as sink:
            csv_writer = csv.DictWriter(
                sink, fieldnames=SERVICE_REQUEST_EXPORT_COLUMNS
            )
            csv_writer.writeheader()
            record_stream = iter_service_requests_from_socrata(
                socrata_config,
                filters=filters,
                request_open=fetch,
                on_page=on_page,
            )
            for candidate in record_stream:
                if record_matches_service_request_filter(candidate, filters):
                    _write_record_row(csv_writer, candidate)
                    written += 1
                    if max_records is not None and written >= max_records:
                        break
        # Publish only after the file handle is closed and fully flushed.
        partial_path.replace(output_path)
    except BaseException:
        # Covers KeyboardInterrupt too, so an aborted run leaves no partial file.
        if partial_path.is_file():
            partial_path.unlink()
        raise

    _write_meta(output_path, written, socrata_config, filters)
    return output_path

load_service_requests_from_csv ¶

load_service_requests_from_csv(
    source: str | Path, *, filters: ServiceRequestFilter
) -> list[ServiceRequestRecord]

Load and filter service-request records from a local CSV snapshot.

Source code in src/nyc311/io/_csv.py

def load_service_requests_from_csv(
    source: str | Path,
    *,
    filters: ServiceRequestFilter,
) -> list[ServiceRequestRecord]:
    """Parse a local CSV snapshot into records and return those passing ``filters``.

    Raises ``ValueError`` when the file has no header row. Every row is parsed
    before the filter is applied.
    """
    with Path(source).open(newline="", encoding="utf-8") as handle:
        rows = csv.DictReader(handle)
        header = rows.fieldnames
        if header is None:
            raise ValueError("CSV file must include a header row.")

        # The validator also reports which column carries the community district.
        district_column = _validate_columns(header)
        parsed = [_record_from_mapping(row, district_column) for row in rows]

    return _apply_filters(parsed, filters)

load_resolution_data ¶

load_resolution_data(
    source: str | Path | SocrataConfig,
    *,
    filters: ServiceRequestFilter | None = None,
    cache_dir: Path | str | None = None,
    refresh: bool = False,
    max_cached_records: int | None = None,
) -> list[ServiceRequestRecord]

Load the subset of service requests that already include resolution text.

Source code in src/nyc311/io/_service_requests.py

def load_resolution_data(
    source: str | Path | SocrataConfig,
    *,
    filters: ServiceRequestFilter | None = None,
    cache_dir: Path | str | None = None,
    refresh: bool = False,
    max_cached_records: int | None = None,
) -> list[ServiceRequestRecord]:
    """Return only the service requests whose ``resolution_description`` is set.

    All arguments are passed straight through to :func:`load_service_requests`;
    the loaded records are then narrowed to those with a non-``None``
    resolution description.
    """
    return [
        request
        for request in load_service_requests(
            source,
            filters=filters,
            cache_dir=cache_dir,
            refresh=refresh,
            max_cached_records=max_cached_records,
        )
        if request.resolution_description is not None
    ]

load_service_requests ¶

load_service_requests(
    source: str | Path | SocrataConfig,
    *,
    filters: ServiceRequestFilter | None = None,
    cache_dir: Path | str | None = None,
    refresh: bool = False,
    max_cached_records: int | None = None,
) -> list[ServiceRequestRecord]

Load filtered NYC 311-style service-request records from CSV or Socrata.

When source is a nyc311.models.SocrataConfig and cache_dir is set, the live API response is streamed to a deterministic CSV under cache_dir (see cached_fetch()), then loaded from disk. Very large extracts should use cached_fetch() with chunked pandas analysis instead of this helper, which returns an in-memory list.

Source code in src/nyc311/io/_service_requests.py

def load_service_requests(
    source: str | Path | SocrataConfig,
    *,
    filters: ServiceRequestFilter | None = None,
    cache_dir: Path | str | None = None,
    refresh: bool = False,
    max_cached_records: int | None = None,
) -> list[ServiceRequestRecord]:
    """Load filtered NYC 311-style service-request records from CSV or Socrata.

    Dispatch depends on ``source``:

    * anything other than a :class:`~nyc311.models.SocrataConfig` is read as a
      local CSV;
    * a ``SocrataConfig`` without ``cache_dir`` is fetched live;
    * a ``SocrataConfig`` with ``cache_dir`` is first streamed to a
      deterministic CSV via :func:`cached_fetch` and then read back from disk.

    ``refresh`` and ``max_cached_records`` are only used on the cached path.
    The result is an in-memory list, so very large extracts are better served
    by :func:`cached_fetch` plus chunked pandas analysis.
    """
    active_filter = filters or ServiceRequestFilter()

    if not isinstance(source, SocrataConfig):
        return load_service_requests_from_csv(source, filters=active_filter)

    if cache_dir is None:
        return load_service_requests_from_socrata(
            source,
            filters=active_filter,
            request_open=urlopen,
        )

    snapshot_path = cached_fetch(
        source,
        active_filter,
        cache_dir=Path(cache_dir),
        refresh=refresh,
        request_open=urlopen,
        max_records=max_cached_records,
    )
    return load_service_requests_from_csv(snapshot_path, filters=active_filter)

iter_service_requests_from_socrata ¶

iter_service_requests_from_socrata(
    socrata_config: SocrataConfig,
    *,
    filters: ServiceRequestFilter,
    request_open: Callable[..., Any],
    on_page: Callable[[int, int], None] | None = None,
) -> Iterator[ServiceRequestRecord]

Yield service-request records from Socrata without holding all pages in memory.

on_page is invoked after each successful HTTP response with (page_index, row_count_in_page) (0-based page index).

Source code in src/nyc311/io/_socrata.py

def iter_service_requests_from_socrata(
    socrata_config: SocrataConfig,
    *,
    filters: ServiceRequestFilter,
    request_open: Callable[..., Any],
    on_page: Callable[[int, int], None] | None = None,
) -> Iterator[ServiceRequestRecord]:
    """Lazily yield service-request records from Socrata, one page at a time.

    Pages of ``socrata_config.page_size`` rows are requested at increasing
    offsets until an empty or short page arrives, or until
    ``socrata_config.max_pages`` pages have been fetched (when that cap is set).

    ``on_page`` is called after each successful HTTP response with
    ``(page_index, row_count_in_page)``; the page index is 0-based.

    Raises ``ValueError`` if a response row is not a JSON object.
    """
    request_headers = {"Accept": "application/json"}
    if socrata_config.app_token is not None:
        request_headers["X-App-Token"] = socrata_config.app_token

    page_size = socrata_config.page_size
    row_offset = 0
    page_index = 0

    while (
        socrata_config.max_pages is None or page_index < socrata_config.max_pages
    ):
        page_url = _build_socrata_url(socrata_config, filters, offset=row_offset)
        rows = _fetch_socrata_page_json(
            Request(page_url, headers=request_headers),
            request_open=request_open,
            timeout=socrata_config.request_timeout_seconds,
        )

        # Progress is reported for every response, including a final empty page.
        if on_page is not None:
            on_page(page_index, len(rows))

        if not rows:
            return

        for row in rows:
            if not isinstance(row, dict):
                raise ValueError(
                    "Unexpected Socrata response row; expected a JSON object."
                )
            normalized = _normalize_socrata_row(row)
            # Rows expose the district under one of two column names.
            if "community_district" in normalized:
                district_column = "community_district"
            else:
                district_column = "community_board"
            yield _record_from_mapping(normalized, district_column)

        # A short page means the result set is exhausted.
        if len(rows) < page_size:
            return
        row_offset += page_size
        page_index += 1

load_service_requests_from_socrata ¶

load_service_requests_from_socrata(
    socrata_config: SocrataConfig,
    *,
    filters: ServiceRequestFilter,
    request_open: Callable[..., Any],
) -> list[ServiceRequestRecord]

Load and filter service-request records from the live Socrata API.

Source code in src/nyc311/io/_socrata.py

def load_service_requests_from_socrata(
    socrata_config: SocrataConfig,
    *,
    filters: ServiceRequestFilter,
    request_open: Callable[..., Any],
) -> list[ServiceRequestRecord]:
    """Fetch every page from the live Socrata API, then apply ``filters``.

    The whole stream from :func:`iter_service_requests_from_socrata` is
    collected into a list before filtering.
    """
    fetched = [
        *iter_service_requests_from_socrata(
            socrata_config, filters=filters, request_open=request_open
        )
    ]
    return _apply_filters(fetched, filters)

Analysis¶

nyc311.analysis ¶

Public analysis helpers for nyc311 complaint workflows.

DEFAULT_TOPIC_RULES `module-attribute` ¶

# Built-in topic rules keyed by complaint type. Each value is a tuple of
# (topic label, keyword tuple) pairs.
# NOTE(review): pair order presumably sets match precedence — confirm against
# the rule matcher before reordering.
DEFAULT_TOPIC_RULES: Final[dict[str, TopicRuleSet]] = {
    "Noise - Residential": (
        (
            "party_music",
            (
                "party",
                "music",
                "speakers",
                "stereo",
                "bass",
                "television",
            ),
        ),
        (
            "construction",
            ("construction", "drilling", "jackhammer"),
        ),
        ("pet_noise", ("dog", "barking", "pet")),
        (
            "banging",
            (
                "banging",
                "thumping",
                "shaking",
                "arguing",
                "hammering",
            ),
        ),
    ),
    "Illegal Parking": (
        ("hydrant_blocking", ("hydrant", "fire hydrant")),
        ("crosswalk_blocking", ("crosswalk",)),
        ("bus_stop_blocking", ("bus stop",)),
        # "double parked" used to be listed twice; the redundant copy is dropped.
        ("double_parked", ("double parked", "double parking")),
    ),
    "Blocked Driveway": (
        (
            "commercial_driveway",
            ("commercial van", "delivery truck", "truck"),
        ),
        ("overnight_blocking", ("overnight", "all night")),
        (
            "residential_driveway",
            ("residential driveway", "driveway", "garage"),
        ),
    ),
    "Rodent": (
        (
            "extermination_request",
            (
                "exterminator",
                "extermination",
                "infestation",
            ),
        ),
        ("rats_seen", ("rats", "rat", "trash bags")),
        ("mouse_condition", ("mouse", "mice", "droppings")),
    ),
    "HEAT/HOT WATER": (
        (
            "no_heat",
            (
                "no heat",
                "without heat",
                "radiator cold",
                "heat not working",
            ),
        ),
        (
            "no_hot_water",
            (
                "no hot water",
                "without hot water",
                "hot water not working",
            ),
        ),
        (
            "intermittent_heat",
            (
                "intermittent heat",
                "heat comes and goes",
                "heat inconsistent",
            ),
        ),
    ),
    "Street Condition": (
        ("pothole", ("pothole", "potholes")),
        (
            "cave_in",
            (
                "cave in",
                "cave-in",
                "sinkhole",
                "collapsed roadway",
            ),
        ),
        (
            "rough_road",
            (
                "uneven",
                "rough road",
                "broken asphalt",
                "road surface",
            ),
        ),
    ),
    "Noise - Street/Sidewalk": (
        (
            "construction",
            ("construction", "drilling", "jackhammer"),
        ),
        (
            "loud_vehicle",
            (
                "car alarm",
                "engine idling",
                "horn",
                "vehicle",
                "muffler",
            ),
        ),
        (
            "bar_noise",
            (
                "bar",
                "club",
                "restaurant",
                "patrons",
                "crowd",
            ),
        ),
    ),
    "UNSANITARY CONDITION": (
        (
            "garbage",
            ("garbage", "trash", "refuse", "debris"),
        ),
        (
            "sewage",
            ("sewage", "feces", "human waste", "overflow"),
        ),
        (
            "pest_waste",
            (
                "rodent",
                "rat",
                "mouse",
                "droppings",
                "animal waste",
            ),
        ),
    ),
    "Abandoned Vehicle": (
        (
            "derelict_vehicle",
            (
                "abandoned",
                "derelict",
                "stripped",
                "wrecked",
            ),
        ),
        (
            "unlicensed_vehicle",
            (
                "no plate",
                "no registration",
                "expired registration",
            ),
        ),
    ),
}

TopicRule `module-attribute` ¶

# One rule: a topic label paired with a tuple of strings, e.g.
# ("pet_noise", ("dog", "barking", "pet")).
# NOTE(review): the strings are presumably keywords matched against complaint
# text — confirm against the matcher.
TopicRule = tuple[str, tuple[str, ...]]

TopicRuleSet `module-attribute` ¶

# All rules for one complaint type: an immutable sequence of TopicRule pairs.
TopicRuleSet = tuple[TopicRule, ...]

aggregate_by_geography ¶

aggregate_by_geography(
    topic_assignments: list[TopicAssignment], geography: str
) -> list[GeographyTopicSummary]

Aggregate deterministic topic assignments into supported geographies.

Source code in src/nyc311/analysis/_aggregation.py

def aggregate_by_geography(
    topic_assignments: list[TopicAssignment],
    geography: str,
) -> list[GeographyTopicSummary]:
    """Roll topic assignments up into per-geography, per-complaint-type summaries.

    Assignments are bucketed by ``(geography value, complaint type)``. Within
    each bucket the topics are ranked by descending count (ties broken by topic
    name); rank 1 is flagged as the dominant topic and each topic's share is its
    count divided by the bucket total. Buckets are emitted in sorted key order.
    Empty input yields an empty list.
    """
    if not topic_assignments:
        return []

    topic_tally: dict[tuple[str, str, str], int] = defaultdict(int)
    bucket_tally: dict[tuple[str, str], int] = defaultdict(int)
    for item in topic_assignments:
        bucket = (
            item.record.geography_value(geography),
            item.record.complaint_type,
        )
        topic_tally[(*bucket, item.topic)] += 1
        bucket_tally[bucket] += 1

    topics_by_bucket: dict[tuple[str, str], list[tuple[str, int]]] = defaultdict(list)
    for (area, complaint, topic_name), tally in topic_tally.items():
        topics_by_bucket[(area, complaint)].append((topic_name, tally))

    results: list[GeographyTopicSummary] = []
    for bucket in sorted(topics_by_bucket):
        area, complaint = bucket
        bucket_total = bucket_tally[bucket]
        # Largest count first; alphabetical topic name settles ties.
        ranked = sorted(
            topics_by_bucket[bucket], key=lambda pair: (-pair[1], pair[0])
        )
        results.extend(
            GeographyTopicSummary(
                geography=geography,
                geography_value=area,
                complaint_type=complaint,
                topic=topic_name,
                complaint_count=tally,
                geography_total_count=bucket_total,
                share_of_geography=tally / bucket_total,
                topic_rank=rank,
                is_dominant_topic=rank == 1,
            )
            for rank, (topic_name, tally) in enumerate(ranked, start=1)
        )

    return results

detect_anomalies ¶

detect_anomalies(
    aggregated_data: list[GeographyTopicSummary],
    window: AnalysisWindow,
    *,
    z_threshold: float = 2.0,
) -> list[AnomalyResult]

Score unusually high or low aggregated topic counts via z-scores.

Source code in src/nyc311/analysis/_anomalies.py

def detect_anomalies(
    aggregated_data: list[GeographyTopicSummary],
    window: AnalysisWindow,
    *,
    z_threshold: float = 2.0,
) -> list[AnomalyResult]:
    """Flag aggregated topic counts whose z-score magnitude reaches ``z_threshold``.

    Summaries are grouped by ``(geography, complaint_type)`` and z-scores are
    computed over each group's complaint counts via ``_compute_z_scores``. Every
    summary yields one result, carrying ``window.days`` and the threshold used;
    results are returned with the largest ``|z|`` first.

    Raises ``ValueError`` when ``z_threshold`` is not positive. Empty input
    yields an empty list.
    """
    if z_threshold <= 0:
        raise ValueError("z_threshold must be positive.")
    if not aggregated_data:
        return []

    by_group: dict[tuple[str, str], list[GeographyTopicSummary]] = defaultdict(list)
    for row in aggregated_data:
        by_group[(row.geography, row.complaint_type)].append(row)

    scored: list[AnomalyResult] = []
    for members in by_group.values():
        # Fix a deterministic order so counts and scores line up one-to-one.
        members.sort(
            key=lambda row: (row.geography_value, row.topic_rank, row.topic)
        )
        counts = [row.complaint_count for row in members]
        for row, score in zip(members, _compute_z_scores(counts), strict=True):
            scored.append(
                AnomalyResult(
                    geography=row.geography,
                    geography_value=row.geography_value,
                    complaint_type=row.complaint_type,
                    topic=row.topic,
                    complaint_count=row.complaint_count,
                    geography_total_count=row.geography_total_count,
                    share_of_geography=row.share_of_geography,
                    topic_rank=row.topic_rank,
                    z_score=score,
                    is_anomaly=abs(score) >= z_threshold,
                    window_days=window.days,
                    anomaly_threshold=z_threshold,
                )
            )

    scored.sort(
        key=lambda hit: (
            -abs(hit.z_score),
            hit.geography,
            hit.complaint_type,
            hit.geography_value,
            hit.topic_rank,
            hit.topic,
        )
    )
    return scored

analyze_topic_coverage ¶

analyze_topic_coverage(
    service_requests: list[ServiceRequestRecord],
    query: TopicQuery,
    *,
    custom_rules: TopicRuleSet | None = None,
    top_unmatched_n: int = 10,
) -> TopicCoverageReport

Report how many records a topic configuration matched versus how many fell into the "other" topic.

Source code in src/nyc311/analysis/_coverage.py

def analyze_topic_coverage(
    service_requests: list[ServiceRequestRecord],
    query: TopicQuery,
    *,
    custom_rules: TopicRuleSet | None = None,
    top_unmatched_n: int = 10,
) -> TopicCoverageReport:
    """Report how much a topic configuration matched versus falling into other.

    Records are narrowed to ``query.complaint_type`` and run through
    :func:`extract_topics`; every figure in the report is computed from the
    assignments that call returns.

    ``coverage_rate`` is always a ``float``: matched assignments divided by all
    assignments, or ``0.0`` when there are none. ``top_unmatched_descriptors``
    holds the ``top_unmatched_n`` most common normalized descriptors among the
    unmatched assignments; blank descriptors are counted under the
    ``_UNSPECIFIED_TEXT`` placeholder.
    """
    matching_records = [
        record
        for record in service_requests
        if record.complaint_type == query.complaint_type
    ]
    assignments = extract_topics(
        matching_records,
        query,
        custom_rules=custom_rules,
    )
    total_records = len(assignments)
    matched_records = sum(
        assignment.topic != _OTHER_TOPIC for assignment in assignments
    )
    unmatched_descriptors = Counter(
        _normalize_value(assignment.record.descriptor) or _UNSPECIFIED_TEXT
        for assignment in assignments
        if assignment.topic == _OTHER_TOPIC
    )
    # Keep the rate a float even for empty input; it was the int 0 before.
    coverage_rate = matched_records / total_records if total_records else 0.0
    return TopicCoverageReport(
        complaint_type=query.complaint_type,
        total_records=total_records,
        matched_records=matched_records,
        other_records=total_records - matched_records,
        coverage_rate=coverage_rate,
        top_unmatched_descriptors=tuple(
            unmatched_descriptors.most_common(top_unmatched_n)
        ),
    )

analyze_resolution_gaps ¶

analyze_resolution_gaps(
    service_requests: list[ServiceRequestRecord],
    resolution_data: list[ServiceRequestRecord],
) -> list[ResolutionGapSummary]

Summarize unresolved complaint share by borough and complaint type.

Source code in src/nyc311/analysis/_resolution.py

def analyze_resolution_gaps(
    service_requests: list[ServiceRequestRecord],
    resolution_data: list[ServiceRequestRecord],
) -> list[ResolutionGapSummary]:
    """Compute resolved/unresolved counts and shares per borough and complaint type.

    A request counts as resolved when it carries a resolution description
    itself, or when its id appears among the ``resolution_data`` records that
    carry one. Summaries are ordered by highest unresolved share, then largest
    volume, then borough and complaint type. Empty input yields an empty list.
    """
    if not service_requests:
        return []

    ids_with_resolution = {
        entry.service_request_id
        for entry in resolution_data
        if entry.resolution_description is not None
    }

    totals: dict[tuple[str, str], int] = defaultdict(int)
    resolved: dict[tuple[str, str], int] = defaultdict(int)
    for request in service_requests:
        key = (request.borough, request.complaint_type)
        totals[key] += 1
        is_resolved = (
            request.resolution_description is not None
            or request.service_request_id in ids_with_resolution
        )
        if is_resolved:
            resolved[key] += 1

    rows: list[ResolutionGapSummary] = []
    for key in sorted(totals):
        borough, complaint_type = key
        total = totals[key]
        done = resolved[key]
        pending = total - done
        rows.append(
            ResolutionGapSummary(
                geography="borough",
                geography_value=borough,
                complaint_type=complaint_type,
                total_request_count=total,
                resolved_request_count=done,
                unresolved_request_count=pending,
                unresolved_share=pending / total,
                resolution_rate=done / total,
            )
        )

    rows.sort(
        key=lambda row: (
            -row.unresolved_share,
            -row.total_request_count,
            row.geography_value,
            row.complaint_type,
        )
    )
    return rows

extract_topics ¶

extract_topics(
    service_requests: list[ServiceRequestRecord],
    query: TopicQuery,
    *,
    custom_rules: TopicRuleSet | None = None,
) -> list[TopicAssignment]

Extract deterministic first-pass topics for one complaint type.

Source code in src/nyc311/analysis/_topics.py

def extract_topics(
    service_requests: list[ServiceRequestRecord],
    query: TopicQuery,
    *,
    custom_rules: TopicRuleSet | None = None,
) -> list[TopicAssignment]:
    """Assign a deterministic first-pass topic to records of one complaint type.

    Only records whose complaint type equals ``query.complaint_type`` are
    considered; with none, the result is an empty list. The rules returned by
    ``_select_topic_rules`` drive the assignment, and the fallback extractor is
    used when it returns ``None``. The assignments are finally passed through
    ``_limit_assignments`` with ``query.top_n``.
    """
    target_type = query.complaint_type
    selected = [
        request
        for request in service_requests
        if request.complaint_type == target_type
    ]
    if not selected:
        return []

    rule_set = _select_topic_rules(target_type, custom_rules)
    assignments = (
        _extract_fallback_topics(selected)
        if rule_set is None
        else _extract_rule_based_topics(selected, rule_set)
    )
    return _limit_assignments(assignments, top_n=query.top_n)

register_topic_rules ¶

register_topic_rules(
    complaint_type: str, rules: TopicRuleSet
) -> None

Register or replace topic rules for one complaint type.

Source code in src/nyc311/analysis/_topics.py

def register_topic_rules(complaint_type: str, rules: TopicRuleSet) -> None:
    """Store ``rules`` for ``complaint_type``, replacing any earlier registration.

    Both the complaint type and the rules are normalized before being stored in
    the module-level registry. Raises ``ValueError`` when the normalized
    complaint type is empty.
    """
    registry_key = _normalize_value(complaint_type)
    if registry_key:
        _REGISTERED_TOPIC_RULES[registry_key] = _normalize_topic_rules(rules)
        return
    raise ValueError("complaint_type must not be empty.")

Geographies¶

nyc311.geographies ¶

Public access to packaged NYC geography layers and boundary helpers.

boundaries_to_dataframe ¶

boundaries_to_dataframe(
    boundaries: BoundaryCollection,
) -> pd.DataFrame

Convert a typed boundary collection into a DataFrame.

Source code in src/nyc311/geographies/_conversions.py

def boundaries_to_dataframe(boundaries: BoundaryCollection) -> pd.DataFrame:
    """Convert a typed boundary collection into a DataFrame.

    Delegates to ``toolkit_boundaries_to_dataframe`` and returns its result
    unchanged. If that call raises ``ImportError``, a new ``ImportError`` with
    pandas installation guidance is raised, chained to the original.
    """
    try:
        return toolkit_boundaries_to_dataframe(boundaries)
    except ImportError as exc:  # pragma: no cover - exercised in optional tests
        # Swap the low-level import failure for actionable install hints while
        # keeping the original exception as the cause.
        raise ImportError(
            "pandas is required for nyc311 geography dataframe helpers. "
            "Install it with `pip install nyc311[dataframes]`, "
            "`pip install nyc311[science]`, or `pip install pandas`."
        ) from exc

boundaries_to_geojson ¶

boundaries_to_geojson(
    boundaries: BoundaryCollection,
) -> dict[str, object]

Convert a typed boundary collection into a GeoJSON FeatureCollection.

Source code in src/nyc311/geographies/_conversions.py

def boundaries_to_geojson(boundaries: BoundaryCollection) -> dict[str, object]:
    """Convert a typed boundary collection into a GeoJSON FeatureCollection.

    Thin wrapper: forwards ``boundaries`` to ``toolkit_boundaries_to_geojson``
    and returns its result unchanged.
    """
    return toolkit_boundaries_to_geojson(boundaries)

list_boundary_layers ¶

list_boundary_layers() -> tuple[str, ...]

List the packaged NYC boundary layers shipped with nyc311.

Source code in src/nyc311/geographies/_loaders.py

def list_boundary_layers() -> tuple[str, ...]:
    """List the packaged NYC boundary layers shipped with nyc311.

    Thin wrapper: returns whatever ``toolkit_list_boundary_layers`` returns.
    """
    return toolkit_list_boundary_layers()

list_boundary_values ¶

list_boundary_values(layer: str) -> tuple[str, ...]

List the canonical values available for one packaged boundary layer.

Source code in src/nyc311/geographies/_loaders.py

def list_boundary_values(layer: str) -> tuple[str, ...]:
    """List the canonical values available for one packaged boundary layer.

    Thin wrapper: forwards ``layer`` to ``toolkit_list_boundary_values`` and
    returns its result unchanged.
    """
    return toolkit_list_boundary_values(layer)

load_boundaries ¶

load_boundaries(source: str | Path) -> BoundaryCollection

Load boundaries from a file path or a packaged NYC boundary layer.

Source code in src/nyc311/geographies/_loaders.py

def load_boundaries(source: str | Path) -> BoundaryCollection:
    """Load boundaries from a file path or a packaged NYC boundary layer.

    Thin wrapper: forwards ``source`` to ``toolkit_load_boundaries``, which
    decides how to interpret it, and returns its result unchanged.
    """
    return toolkit_load_boundaries(source)

load_nyc_boundaries ¶

load_nyc_boundaries(
    layer: str = "community_district",
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> BoundaryCollection

Load a packaged NYC boundary layer as typed boundary models.

Source code in src/nyc311/geographies/_loaders.py

def load_nyc_boundaries(
    layer: str = "community_district",
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> BoundaryCollection:
    """Load a packaged NYC boundary layer as typed boundary models.

    Thin wrapper: ``layer`` (default ``"community_district"``) and the
    keyword-only ``values`` are forwarded unchanged to
    ``toolkit_load_nyc_boundaries``.
    NOTE(review): ``values`` presumably restricts the result to the named
    boundary values — confirm against the toolkit.
    """
    return toolkit_load_nyc_boundaries(layer, values=values)

load_nyc_boundaries_geodataframe ¶

load_nyc_boundaries_geodataframe(
    layer: str = "community_district",
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> gpd.GeoDataFrame

Load a packaged NYC boundary layer directly into a GeoDataFrame.

Source code in src/nyc311/geographies/_loaders.py

def load_nyc_boundaries_geodataframe(
    layer: str = "community_district",
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> gpd.GeoDataFrame:
    """Load a packaged NYC boundary layer directly into a GeoDataFrame.

    Thin wrapper: ``layer`` (default ``"community_district"``) and the
    keyword-only ``values`` are forwarded unchanged to
    ``toolkit_load_nyc_boundaries_geodataframe``.
    """
    return toolkit_load_nyc_boundaries_geodataframe(layer, values=values)

load_nyc_census_tracts ¶

load_nyc_census_tracts(
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> BoundaryCollection

Load the packaged NYC census-tract layer.

Source code in src/nyc311/geographies/_loaders.py

def load_nyc_census_tracts(
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> BoundaryCollection:
    """Load the packaged NYC census-tract layer.

    Thin wrapper: the keyword-only ``values`` is forwarded unchanged to
    ``toolkit_load_nyc_census_tracts``.
    """
    return toolkit_load_nyc_census_tracts(values=values)

load_nyc_council_districts ¶

load_nyc_council_districts(
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> BoundaryCollection

Load the packaged NYC city-council-district layer.

Source code in src/nyc311/geographies/_loaders.py

def load_nyc_council_districts(
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> BoundaryCollection:
    """Load the packaged NYC city-council-district layer.

    Thin wrapper: the keyword-only ``values`` is forwarded unchanged to
    ``toolkit_load_nyc_council_districts``.
    """
    return toolkit_load_nyc_council_districts(values=values)

load_nyc_neighborhood_tabulation_areas ¶

load_nyc_neighborhood_tabulation_areas(
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> BoundaryCollection

Load the packaged NYC neighborhood-tabulation-area layer.

Source code in src/nyc311/geographies/_loaders.py

def load_nyc_neighborhood_tabulation_areas(
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> BoundaryCollection:
    """Load the packaged NYC neighborhood-tabulation-area layer.

    Thin wrapper: the keyword-only ``values`` is forwarded unchanged to
    ``toolkit_load_nyc_neighborhood_tabulation_areas``.
    """
    return toolkit_load_nyc_neighborhood_tabulation_areas(values=values)

clip_boundaries_to_bbox ¶

clip_boundaries_to_bbox(
    boundaries: BoundaryCollection,
    *,
    min_longitude: float,
    min_latitude: float,
    max_longitude: float,
    max_latitude: float,
) -> BoundaryCollection

Clip boundary geometries to a longitude/latitude bounding box.

Source code in src/nyc311/geographies/_ops.py

def clip_boundaries_to_bbox(
    boundaries: BoundaryCollection,
    *,
    min_longitude: float,
    min_latitude: float,
    max_longitude: float,
    max_latitude: float,
) -> BoundaryCollection:
    """Clip boundary geometries to a longitude/latitude bounding box.

    Thin wrapper: the collection and the four keyword-only bounds are forwarded
    unchanged to ``toolkit_clip_boundaries_to_bbox``; no validation of the
    bounds happens here.
    """
    return toolkit_clip_boundaries_to_bbox(
        boundaries,
        min_longitude=min_longitude,
        min_latitude=min_latitude,
        max_longitude=max_longitude,
        max_latitude=max_latitude,
    )

spatially_enrich_records ¶

spatially_enrich_records(
    records: list[ServiceRequestRecord],
    *,
    layer: str = "community_district",
    boundaries: BoundaryCollection | None = None,
) -> gpd.GeoDataFrame

Attach packaged boundary attributes to point-capable service requests.

Source code in src/nyc311/geographies/_ops.py

def spatially_enrich_records(
    records: list[ServiceRequestRecord],
    *,
    layer: str = "community_district",
    boundaries: BoundaryCollection | None = None,
) -> gpd.GeoDataFrame:
    """Spatially join service-request records to a boundary layer.

    ``layer`` is always normalized first. When ``boundaries`` is not supplied,
    the packaged layer of that name is loaded. Records and boundaries are each
    converted to GeoDataFrames and then joined.
    """
    layer_name = normalize_boundary_layer(layer)
    # NOTE(review): `or` means any falsy `boundaries` value, not only None,
    # falls back to the packaged layer.
    chosen_boundaries = boundaries or load_nyc_boundaries(layer_name)
    boundary_frame = _boundary_collection_to_geodataframe(chosen_boundaries)
    return spatial_join_records_to_boundaries(
        records_to_geodataframe(records),
        boundary_frame,
    )

Samples¶

nyc311.samples ¶

Packaged sample data helpers for nyc311 examples and tests.

load_sample_boundaries ¶

load_sample_boundaries(
    layer: str = "community_district",
) -> BoundaryCollection

Load the subset of packaged boundaries that overlaps the sample records.

Source code in src/nyc311/samples/_loaders.py

def load_sample_boundaries(layer: str = "community_district") -> BoundaryCollection:
    """Load the subset of packaged boundaries that overlaps the sample records.

    Raises:
        ValueError: If the sample boundary values have no entry for the
            normalized layer.
    """
    layer_key = normalize_boundary_layer(layer)
    sample_values = load_sample_boundary_values().get(layer_key)
    if sample_values is not None:
        return load_nyc_boundaries(layer_key, values=sample_values)
    raise ValueError(
        "No packaged sample boundaries are available for layer "
        f"{layer_key!r}."
    )

load_sample_service_requests ¶

load_sample_service_requests(
    *, filters: ServiceRequestFilter | None = None
) -> list[ServiceRequestRecord]

Load the packaged sample NYC 311 service-request slice.

Source code in src/nyc311/samples/_loaders.py

def load_sample_service_requests(
    *,
    filters: ServiceRequestFilter | None = None,
) -> list[ServiceRequestRecord]:
    """Load the packaged sample NYC 311 service-request slice.

    When ``filters`` is omitted, a default-constructed
    ``ServiceRequestFilter`` is passed to the CSV loader.
    """
    with sample_service_request_path() as csv_path:
        active_filters = filters or ServiceRequestFilter()
        sample_records = load_service_requests_from_csv(
            csv_path,
            filters=active_filters,
        )
    return sample_records

Export¶

nyc311.export ¶

Public export helpers for nyc311 outputs.

export_anomalies ¶

export_anomalies(
    data: list[AnomalyResult], target: ExportTarget
) -> Path

Export anomaly detections to a CSV file.

Source code in src/nyc311/export/_csv.py

def export_anomalies(data: list[AnomalyResult], target: ExportTarget) -> Path:
    """Export anomaly detections to a CSV file.

    Float columns (share, z-score, threshold) are written with six decimal
    places and ``is_anomaly`` as a lowercase ``true``/``false`` string.

    Raises:
        ValueError: If ``target.format`` is not ``"csv"``.
    """
    if target.format != "csv":
        raise ValueError(
            "export_anomalies() currently supports only CSV output. "
            f"Got format={target.format!r}."
        )

    def _as_csv_row(result: AnomalyResult) -> dict[str, object]:
        # Map one result onto the CSV columns, pre-formatting floats and flags.
        return {
            "geography": result.geography,
            "geography_value": result.geography_value,
            "complaint_type": result.complaint_type,
            "topic": result.topic,
            "complaint_count": result.complaint_count,
            "geography_total_count": result.geography_total_count,
            "share_of_geography": f"{result.share_of_geography:.6f}",
            "topic_rank": result.topic_rank,
            "z_score": f"{result.z_score:.6f}",
            "is_anomaly": str(result.is_anomaly).lower(),
            "window_days": result.window_days,
            "anomaly_threshold": f"{result.anomaly_threshold:.6f}",
        }

    destination = target.output_path
    destination.parent.mkdir(parents=True, exist_ok=True)

    with destination.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=ANOMALY_COLUMNS)
        writer.writeheader()
        writer.writerows(_as_csv_row(result) for result in data)

    return destination

export_service_requests_csv ¶

export_service_requests_csv(
    data: list[ServiceRequestRecord], target: ExportTarget
) -> Path

Export loaded service-request records to a reproducible CSV snapshot.

Source code in src/nyc311/export/_csv.py

def export_service_requests_csv(
    data: list[ServiceRequestRecord], target: ExportTarget
) -> Path:
    """Export loaded service-request records to a reproducible CSV snapshot.

    Missing resolution text, close dates, and coordinates are written as empty
    cells.

    Raises:
        ValueError: If ``target.format`` is not ``"csv"``.
    """
    if target.format != "csv":
        raise ValueError(
            "export_service_requests_csv() currently supports only CSV output. "
            f"Got format={target.format!r}."
        )

    # Timestamp policy: prefer the full-precision ``*_at`` value when present
    # so the export round-trips hour-grain resolution time; otherwise fall
    # back to the day-grain ``*_date`` value.
    def _created_text(record: ServiceRequestRecord) -> str:
        stamp = record.created_at
        if stamp is None:
            stamp = record.created_date
        return stamp.isoformat()

    def _closed_text(record: ServiceRequestRecord) -> str:
        stamp = record.closed_at
        if stamp is None:
            stamp = record.closed_date
        return "" if stamp is None else stamp.isoformat()

    def _as_csv_row(record: ServiceRequestRecord) -> dict[str, object]:
        return {
            "unique_key": record.service_request_id,
            "created_date": _created_text(record),
            "complaint_type": record.complaint_type,
            "descriptor": record.descriptor,
            "borough": record.borough,
            "community_district": record.community_district,
            "resolution_description": record.resolution_description or "",
            "closed_date": _closed_text(record),
            "latitude": "" if record.latitude is None else record.latitude,
            "longitude": "" if record.longitude is None else record.longitude,
        }

    destination = target.output_path
    destination.parent.mkdir(parents=True, exist_ok=True)

    with destination.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=SERVICE_REQUEST_EXPORT_COLUMNS)
        writer.writeheader()
        writer.writerows(_as_csv_row(record) for record in data)

    return destination

export_topic_table ¶

export_topic_table(
    data: list[GeographyTopicSummary], target: ExportTarget
) -> Path

Export geography-topic summaries to a CSV file.

Source code in src/nyc311/export/_csv.py

def export_topic_table(data: list[GeographyTopicSummary], target: ExportTarget) -> Path:
    """Export geography-topic summaries to a CSV file.

    ``share_of_geography`` is written with six decimal places and
    ``is_dominant_topic`` as a lowercase ``true``/``false`` string.

    Raises:
        ValueError: If ``target.format`` is not ``"csv"``.
    """
    if target.format != "csv":
        raise ValueError(
            "export_topic_table() currently supports only CSV output. "
            f"Got format={target.format!r}."
        )

    def _as_csv_row(summary: GeographyTopicSummary) -> dict[str, object]:
        # Map one summary onto the CSV columns, pre-formatting share and flag.
        return {
            "geography": summary.geography,
            "geography_value": summary.geography_value,
            "complaint_type": summary.complaint_type,
            "topic": summary.topic,
            "complaint_count": summary.complaint_count,
            "geography_total_count": summary.geography_total_count,
            "share_of_geography": f"{summary.share_of_geography:.6f}",
            "topic_rank": summary.topic_rank,
            "is_dominant_topic": str(summary.is_dominant_topic).lower(),
        }

    destination = target.output_path
    destination.parent.mkdir(parents=True, exist_ok=True)

    with destination.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=TOPIC_SUMMARY_COLUMNS)
        writer.writeheader()
        writer.writerows(_as_csv_row(summary) for summary in data)

    return destination

export_geojson ¶

export_geojson(
    data: BoundaryGeoJSONExport, target: ExportTarget
) -> Path

Export supported boundary-backed complaint outputs to GeoJSON.

Source code in src/nyc311/export/_geojson.py

def export_geojson(data: BoundaryGeoJSONExport, target: ExportTarget) -> Path:
    """Export supported boundary-backed complaint outputs to GeoJSON.

    Every boundary feature is emitted. Features whose ``geography_value`` has a
    dominant-topic summary additionally carry that summary's topic fields.

    Raises:
        ValueError: If ``target.format`` is not ``"geojson"``.
    """
    if target.format != "geojson":
        raise ValueError(
            "export_geojson() currently supports only GeoJSON output. "
            f"Got format={target.format!r}."
        )

    destination = target.output_path
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Index dominant summaries by geography value; when several share a value
    # the last one encountered wins.
    dominant_by_value = {}
    for candidate in data.summaries:
        if candidate.is_dominant_topic:
            dominant_by_value[candidate.geography_value] = candidate

    def _as_feature(boundary) -> dict[str, object]:
        # Boundary properties are merged after the geography keys, so they
        # take precedence on any key collision.
        feature_properties: dict[str, object] = {
            "geography": boundary.geography,
            "geography_value": boundary.geography_value,
            **boundary.properties,
        }
        matched = dominant_by_value.get(boundary.geography_value)
        if matched is not None:
            feature_properties["complaint_type"] = matched.complaint_type
            feature_properties["dominant_topic"] = matched.topic
            feature_properties["topic_count"] = matched.complaint_count
            feature_properties["geography_total_count"] = matched.geography_total_count
            feature_properties["share_of_geography"] = round(
                matched.share_of_geography, 6
            )
        return {
            "type": "Feature",
            "geometry": boundary.geometry,
            "properties": feature_properties,
        }

    features: list[dict[str, object]] = [
        _as_feature(boundary) for boundary in data.boundaries.features
    ]
    payload = json.dumps(
        {"type": "FeatureCollection", "features": features},
        indent=2,
        sort_keys=True,
    )
    destination.write_text(payload, encoding="utf-8")
    return destination

export_report_card ¶

export_report_card(
    data: object, target: ExportTarget
) -> Path

Export a markdown report card from summaries, gaps, and anomalies.

Source code in src/nyc311/export/_report.py

def export_report_card(data: object, target: ExportTarget) -> Path:
    """Export a markdown report card from summaries, gaps, and anomalies.

    One ``##`` section is written per geography value (sorted), each listing at
    most five dominant topics, five resolution-gap rows, and five flagged
    anomalies, with a placeholder bullet when a list is empty.

    Raises:
        ValueError: If ``target.format`` is neither ``"md"`` nor ``"markdown"``.
    """
    if target.format not in {"md", "markdown"}:
        raise ValueError(
            "export_report_card() currently supports only markdown output. "
            f"Got format={target.format!r}."
        )

    topic_summaries, resolution_gaps, anomalies = _coerce_report_card_data(data)
    report_path = target.output_path
    report_path.parent.mkdir(parents=True, exist_ok=True)

    def _group_by_geography(items):
        # Bucket items by ``geography_value``, preserving input order per bucket.
        buckets = {}
        for item in items:
            buckets.setdefault(item.geography_value, []).append(item)
        return buckets

    geography_values = sorted(
        {item.geography_value for item in topic_summaries}
        | {item.geography_value for item in resolution_gaps}
        | {item.geography_value for item in anomalies}
    )
    dominant_by_geography: dict[str, list[GeographyTopicSummary]] = (
        _group_by_geography(
            summary for summary in topic_summaries if summary.is_dominant_topic
        )
    )
    gaps_by_geography: dict[str, list[ResolutionGapSummary]] = _group_by_geography(
        resolution_gaps
    )
    anomalies_by_geography: dict[str, list[AnomalyResult]] = _group_by_geography(
        anomalies
    )

    lines = ["# NYC311 Report Card", ""]
    for geography_value in geography_values:
        lines += [f"## {geography_value}", ""]

        # Dominant topics: largest geography total first, ties by complaint type.
        top_topics = sorted(
            dominant_by_geography.get(geography_value, []),
            key=lambda item: (-item.geography_total_count, item.complaint_type),
        )[:5]
        lines.append("Dominant topic")
        if top_topics:
            for item in top_topics:
                lines.append(
                    f"- {item.complaint_type}: {item.topic} "
                    f"({item.complaint_count}/{item.geography_total_count}, "
                    f"{item.share_of_geography:.1%})"
                )
        else:
            lines.append("- No topic summaries available.")
        lines.append("")

        # Resolution gaps: busiest complaint types first.
        lines.append("Resolution overview")
        top_gaps = sorted(
            gaps_by_geography.get(geography_value, []),
            key=lambda item: (-item.total_request_count, item.complaint_type),
        )[:5]
        if top_gaps:
            for item in top_gaps:
                lines.append(
                    f"- {item.complaint_type}: resolution rate {item.resolution_rate:.1%}, "
                    f"unresolved {item.unresolved_request_count}/{item.total_request_count}"
                )
        else:
            lines.append("- No resolution gap summaries available.")
        lines.append("")

        # Anomalies: rank everything by |z| first, then keep only flagged rows.
        lines.append("Anomaly flags")
        ranked_anomalies = sorted(
            anomalies_by_geography.get(geography_value, []),
            key=lambda item: (-abs(item.z_score), item.topic_rank, item.topic),
        )
        flagged = [item for item in ranked_anomalies if item.is_anomaly]
        if flagged:
            for item in flagged[:5]:
                lines.append(
                    f"- {item.complaint_type} / {item.topic}: "
                    f"count={item.complaint_count}, z={item.z_score:.2f}"
                )
        else:
            lines.append("- No anomaly flags above the configured threshold.")
        lines.append("")

    # Trim trailing blank lines and end the file with exactly one newline.
    report_path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8")
    return report_path

Pipeline¶

nyc311.pipeline ¶

High-level workflow helpers for live fetching and topic-analysis pipelines.

fetch_service_requests ¶

fetch_service_requests(
    *,
    filters: ServiceRequestFilter | None = None,
    socrata_config: SocrataConfig | None = None,
    output: str | Path | None = None,
    cache_dir: Path | str | None = None,
    refresh: bool = False,
    max_cached_records: int | None = None,
) -> list[ServiceRequestRecord]

Fetch a live Socrata slice into memory and optionally stage it as CSV.

This is the intended SDK helper for notebook and workflow users who want to fetch once, inspect records in memory, and only export a local snapshot when they decide the filtered slice is worth keeping.

When cache_dir is set, responses are streamed to a CSV cache first (see `nyc311.io.cached_fetch`) and then loaded into memory; avoid huge slices unless you use chunked analysis on the cache file.

Source code in src/nyc311/pipeline.py

def fetch_service_requests(
    *,
    filters: ServiceRequestFilter | None = None,
    socrata_config: SocrataConfig | None = None,
    output: str | Path | None = None,
    cache_dir: Path | str | None = None,
    refresh: bool = False,
    max_cached_records: int | None = None,
) -> list[ServiceRequestRecord]:
    """Fetch a live Socrata slice into memory and optionally stage it as CSV.

    This is the intended SDK helper for notebook and workflow users who want to
    fetch once, inspect records in memory, and only export a local snapshot when
    they decide the filtered slice is worth keeping.

    When ``cache_dir`` is set, responses are streamed to a CSV cache first (see
    :func:`nyc311.io.cached_fetch`), then loaded—avoid huge slices unless you use
    chunked analysis on the cache file.

    When ``output`` is given, the fetched records are also written there via
    ``export_service_requests_csv``. The records are returned either way.
    """
    source_config = socrata_config if socrata_config else SocrataConfig()
    fetched = load_service_requests(
        source_config,
        filters=filters,
        cache_dir=cache_dir,
        refresh=refresh,
        max_cached_records=max_cached_records,
    )
    if output is None:
        return fetched

    snapshot_target = ExportTarget(format="csv", output_path=Path(output))
    export_service_requests_csv(fetched, snapshot_target)
    return fetched

run_topic_pipeline ¶

run_topic_pipeline(
    source: str | Path | SocrataConfig,
    complaint_type: str,
    *,
    geography: str = "community_district",
    filters: ServiceRequestFilter | None = None,
    top_n: int = 20,
    output: str | Path | None = None,
    output_format: str = "csv",
    boundaries: str | Path | None = None,
) -> list[GeographyTopicSummary]

Run the implemented load-extract-aggregate-export topic workflow.

When output is provided, this helper also writes either a CSV or GeoJSON artifact using the same behavior exposed by the current CLI. The aggregated summaries are always returned to support notebook and workflow use cases.

Source code in src/nyc311/pipeline.py

def run_topic_pipeline(
    source: str | Path | SocrataConfig,
    complaint_type: str,
    *,
    geography: str = "community_district",
    filters: ServiceRequestFilter | None = None,
    top_n: int = 20,
    output: str | Path | None = None,
    output_format: str = "csv",
    boundaries: str | Path | None = None,
) -> list[GeographyTopicSummary]:
    """Run the implemented load-extract-aggregate-export topic workflow.

    When ``output`` is provided, this helper also writes either a CSV or GeoJSON
    artifact using the same behavior exposed by the current CLI. The aggregated
    summaries are always returned to support notebook and workflow use cases.

    Export arguments are validated before any records are loaded, so a bad
    ``output_format`` or a missing ``boundaries`` path fails immediately
    instead of after the load and topic extraction.

    Args:
        source: Passed unchanged to ``load_service_requests``.
        complaint_type: The single complaint type to analyze. It replaces any
            ``complaint_types`` carried by ``filters``.
        geography: Geography used by ``aggregate_by_geography``.
        filters: Optional filter. Only its ``start_date``, ``end_date`` and
            ``geography`` are carried over.
        top_n: Forwarded to ``TopicQuery``.
        output: Optional artifact path. No file is written when ``None``.
        output_format: ``"csv"`` or ``"geojson"``. Only consulted when
            ``output`` is given.
        boundaries: Boundary source passed to ``load_boundaries``. Required
            for GeoJSON output.

    Returns:
        The aggregated geography-topic summaries.

    Raises:
        ValueError: If ``output`` is given and the format is neither csv nor
            geojson, or the format is geojson and ``boundaries`` is ``None``.
    """
    # Validate the export request up front so bad arguments fail fast.
    target: ExportTarget | None = None
    if output is not None:
        target = ExportTarget(format=output_format, output_path=Path(output))
        if target.format not in ("csv", "geojson"):
            raise ValueError(
                "run_topic_pipeline() currently supports only csv and geojson output. "
                f"Got format={target.format!r}."
            )
        if target.format == "geojson" and boundaries is None:
            raise ValueError(
                "run_topic_pipeline() requires boundaries when format='geojson'."
            )

    # Rebuild the filter so the pipeline's complaint type always wins over
    # whatever complaint types the caller's filter carried.
    service_request_filter = ServiceRequestFilter(
        start_date=filters.start_date if filters is not None else None,
        end_date=filters.end_date if filters is not None else None,
        geography=filters.geography if filters is not None else None,
        complaint_types=(complaint_type,),
    )
    records = load_service_requests(source, filters=service_request_filter)
    assignments = extract_topics(
        records,
        TopicQuery(complaint_type=complaint_type, top_n=top_n),
    )
    summaries = aggregate_by_geography(assignments, geography=geography)

    if target is None:
        return summaries

    if target.format == "csv":
        export_topic_table(summaries, target)
        return summaries

    # Only geojson remains, and ``boundaries`` was checked above.
    boundary_collection = load_boundaries(boundaries)
    export_geojson(
        BoundaryGeoJSONExport(
            boundaries=boundary_collection, summaries=tuple(summaries)
        ),
        target,
    )
    return summaries

bulk_fetch ¶

bulk_fetch(
    *,
    complaint_types: tuple[str, ...] = (),
    start_date: date | str | None = None,
    end_date: date | str | None = None,
    cache_dir: Path | str = Path("data/cache"),
    boroughs: tuple[str, ...] | None = None,
    app_token: str | None = None,
    page_size: int = 5000,
    on_progress: Callable[[str, int, int], None]
    | None = None,
) -> list[Path]

Fetch full-city 311 data split by borough for manageable file sizes.

Downloads are split per-borough so that each CSV stays under a few hundred megabytes. Files are written to cache_dir with deterministic names; subsequent calls skip any borough whose file already exists. Each completed CSV is paired with a .meta.json sidecar containing the row count, SHA-256 checksum, fetch timestamp, and the filter parameters used.

The Socrata $select fragment requests the schema: unique_key, created_date, closed_date, complaint_type, descriptor, borough, community_board, resolution_description, latitude, longitude. closed_date (added in v1.0.1 per random-walks/nyc311#20) is nullable — unresolved complaints serialize it as an empty column — which lets downstream resolution-time / SLA analyses compute closed_date - created_date directly without a second round-trip.

Parameters:

Name	Type	Description	Default
`complaint_types`	`tuple[str, ...]`	Optional whitelist of complaint types. When empty, every complaint type is included.	`()`
`start_date`	`date \| str \| None`	Inclusive lower bound on `created_date`. Accepts a `datetime.date` or an ISO-8601 string.	`None`
`end_date`	`date \| str \| None`	Inclusive upper bound on `created_date`. Accepts a `datetime.date` or an ISO-8601 string.	`None`
`cache_dir`	`Path \| str`	Directory to write per-borough CSV files into. The directory is created on demand.	`Path('data/cache')`
`boroughs`	`tuple[str, ...] \| None`	Boroughs to include. Defaults to all five.	`None`
`app_token`	`str \| None`	Socrata app token for higher rate limits.	`None`
`page_size`	`int`	Rows per Socrata HTTP request.	`5000`
`on_progress`	`Callable[[str, int, int], None] \| None`	Optional callback invoked after each HTTP page as `on_progress(borough, page_index, page_row_count)`.	`None`

Returns:

Type	Description
`list[Path]`	Paths to the completed per-borough CSV files in the order the boroughs were processed.

Source code in src/nyc311/pipeline.py

def bulk_fetch(
    *,
    complaint_types: tuple[str, ...] = (),
    start_date: date | str | None = None,
    end_date: date | str | None = None,
    cache_dir: Path | str = Path("data/cache"),
    boroughs: tuple[str, ...] | None = None,
    app_token: str | None = None,
    page_size: int = 5_000,
    on_progress: Callable[[str, int, int], None] | None = None,
) -> list[Path]:
    """Fetch full-city 311 data split by borough for manageable file sizes.

    Downloads are split per-borough so that each CSV stays under a few
    hundred megabytes. Files are written to ``cache_dir`` with
    deterministic names; subsequent calls skip any borough whose file
    already exists. Each completed CSV is paired with a ``.meta.json``
    sidecar containing the row count, SHA-256 checksum, fetch
    timestamp, and the filter parameters used.

    The Socrata ``$select`` fragment requests the schema:
    ``unique_key, created_date, closed_date, complaint_type,
    descriptor, borough, community_board, resolution_description,
    latitude, longitude``. ``closed_date`` (added in v1.0.1 per
    random-walks/nyc311#20) is nullable — unresolved complaints
    serialize it as an empty column — which lets downstream
    resolution-time / SLA analyses compute
    ``closed_date - created_date`` directly without a second
    round-trip.

    Args:
        complaint_types: Optional whitelist of complaint types. When
            empty, every complaint type is included.
        start_date: Inclusive lower bound on ``created_date``. Accepts a
            ``datetime.date`` or an ISO-8601 string.
        end_date: Inclusive upper bound on ``created_date``. Accepts a
            ``datetime.date`` or an ISO-8601 string.
        cache_dir: Directory to write per-borough CSV files into. The
            directory is created on demand.
        boroughs: Boroughs to include. Defaults to all five.
        app_token: Socrata app token for higher rate limits.
        page_size: Rows per Socrata HTTP request.
        on_progress: Optional callback invoked after each HTTP page as
            ``on_progress(borough, page_index, page_row_count)``.

    Returns:
        Paths to the completed per-borough CSV files in the order the
        boroughs were processed.
    """

    def _as_date(value: date | str | None) -> date | None:
        # Strings are parsed as ISO dates; anything else passes through as-is.
        if isinstance(value, str):
            return date.fromisoformat(value)
        return value

    def _page_hook(borough: str) -> Callable[[int, int], None]:
        # Bind the borough per hook so each callback reports its own borough
        # rather than whatever the loop variable holds later.
        def _on_page(page_idx: int, row_count: int) -> None:
            if on_progress is not None:
                on_progress(borough, page_idx, row_count)

        return _on_page

    selected_boroughs = boroughs or SUPPORTED_BOROUGHS
    cache_root = Path(cache_dir)
    lower_bound = _as_date(start_date)
    upper_bound = _as_date(end_date)
    config = large_socrata_config(page_size=page_size, app_token=app_token)

    completed: list[Path] = []
    for borough_name in selected_boroughs:
        borough_filter = ServiceRequestFilter(
            start_date=lower_bound,
            end_date=upper_bound,
            geography=GeographyFilter(geography="borough", value=borough_name),
            complaint_types=complaint_types,
        )
        completed.append(
            cached_fetch(
                config,
                borough_filter,
                cache_dir=cache_root,
                on_page=_page_hook(borough_name),
            )
        )

    return completed

DataFrames¶

nyc311.dataframes ¶

Optional pandas conversion helpers for notebook and data-science workflows.

anomalies_to_dataframe ¶

anomalies_to_dataframe(
    anomalies: list[AnomalyResult],
) -> Any

Convert anomaly results into a DataFrame.

Source code in src/nyc311/dataframes/_analysis.py

def anomalies_to_dataframe(anomalies: list[AnomalyResult]) -> Any:
    """Convert anomaly results into a DataFrame.

    Each result becomes one row whose keys mirror the result's attribute names.
    The frame's columns are fixed by ``ANOMALY_COLUMNS``.
    """
    pd = require_pandas()
    # Attribute names copied one-to-one into each row, in this order.
    row_fields = (
        "geography",
        "geography_value",
        "complaint_type",
        "topic",
        "complaint_count",
        "geography_total_count",
        "share_of_geography",
        "topic_rank",
        "z_score",
        "is_anomaly",
        "window_days",
        "anomaly_threshold",
    )
    rows = [
        {field: getattr(result, field) for field in row_fields}
        for result in anomalies
    ]
    return pd.DataFrame.from_records(rows, columns=ANOMALY_COLUMNS)

coverage_to_dataframe ¶

coverage_to_dataframe(
    reports: list[TopicCoverageReport],
) -> Any

Convert topic-coverage reports into a DataFrame.

Source code in src/nyc311/dataframes/_analysis.py

def coverage_to_dataframe(reports: list[TopicCoverageReport]) -> Any:
    """Convert topic-coverage reports into a DataFrame.

    ``top_unmatched_descriptors`` is copied into a fresh ``list`` per row. The
    frame's columns are fixed by ``TOPIC_COVERAGE_COLUMNS``.
    """
    pd = require_pandas()

    def _as_row(report: TopicCoverageReport) -> dict[str, object]:
        return {
            "complaint_type": report.complaint_type,
            "total_records": report.total_records,
            "matched_records": report.matched_records,
            "other_records": report.other_records,
            "coverage_rate": report.coverage_rate,
            "top_unmatched_descriptors": list(report.top_unmatched_descriptors),
        }

    rows = [_as_row(report) for report in reports]
    return pd.DataFrame.from_records(rows, columns=TOPIC_COVERAGE_COLUMNS)

gaps_to_dataframe ¶

gaps_to_dataframe(gaps: list[ResolutionGapSummary]) -> Any

Convert resolution-gap summaries into a DataFrame.

Source code in src/nyc311/dataframes/_analysis.py

def gaps_to_dataframe(gaps: list[ResolutionGapSummary]) -> Any:
    """Convert resolution-gap summaries into a DataFrame.

    Each summary becomes one row whose keys mirror the summary's attribute
    names. The frame's columns are fixed by ``RESOLUTION_GAP_COLUMNS``.
    """
    pd = require_pandas()
    # Attribute names copied one-to-one into each row, in this order.
    row_fields = (
        "geography",
        "geography_value",
        "complaint_type",
        "total_request_count",
        "resolved_request_count",
        "unresolved_request_count",
        "unresolved_share",
        "resolution_rate",
    )
    rows = [{field: getattr(gap, field) for field in row_fields} for gap in gaps]
    return pd.DataFrame.from_records(rows, columns=RESOLUTION_GAP_COLUMNS)

summaries_to_dataframe ¶

summaries_to_dataframe(
    summaries: list[GeographyTopicSummary],
) -> Any

Convert geography-topic summaries into a DataFrame.

Source code in src/nyc311/dataframes/_analysis.py

def summaries_to_dataframe(summaries: list[GeographyTopicSummary]) -> Any:
    """Convert geography-topic summaries into a DataFrame.

    Each summary becomes one row whose keys mirror the summary's attribute
    names. The frame's columns are fixed by ``TOPIC_SUMMARY_COLUMNS``.
    """
    pd = require_pandas()
    # Attribute names copied one-to-one into each row, in this order.
    row_fields = (
        "geography",
        "geography_value",
        "complaint_type",
        "topic",
        "complaint_count",
        "geography_total_count",
        "share_of_geography",
        "topic_rank",
        "is_dominant_topic",
    )
    rows = [
        {field: getattr(summary, field) for field in row_fields}
        for summary in summaries
    ]
    return pd.DataFrame.from_records(rows, columns=TOPIC_SUMMARY_COLUMNS)

assignments_to_dataframe ¶

assignments_to_dataframe(
    assignments: list[TopicAssignment],
) -> Any

Convert topic assignments into a DataFrame.

Source code in src/nyc311/dataframes/_records.py

def assignments_to_dataframe(assignments: list[TopicAssignment]) -> Any:
    """Convert topic assignments into a DataFrame.

    Each row flattens the assignment's underlying record fields alongside its
    ``topic`` and ``normalized_text``. ``created_date`` is converted with
    ``pd.to_datetime`` when the column is present.
    """
    pd = require_pandas()

    def _as_row(assignment: TopicAssignment) -> dict[str, object]:
        source = assignment.record
        return {
            "service_request_id": source.service_request_id,
            "created_date": source.created_date,
            "complaint_type": source.complaint_type,
            "descriptor": source.descriptor,
            "borough": source.borough,
            "community_district": source.community_district,
            "resolution_description": source.resolution_description,
            "latitude": source.latitude,
            "longitude": source.longitude,
            "topic": assignment.topic,
            "normalized_text": assignment.normalized_text,
        }

    frame = pd.DataFrame.from_records(
        [_as_row(assignment) for assignment in assignments],
        columns=TOPIC_ASSIGNMENT_COLUMNS,
    )
    if "created_date" in frame:
        frame["created_date"] = pd.to_datetime(frame["created_date"])
    return frame

dataframe_to_records ¶

dataframe_to_records(
    dataframe: Any,
) -> list[ServiceRequestRecord]

Convert a DataFrame back into typed service-request records.

Source code in src/nyc311/dataframes/_records.py

def _coerce_to_date(value: Any) -> date:
    """Reduce a date-like cell value to a plain ``datetime.date``.

    Accepts objects exposing ``to_pydatetime()``, ``datetime.datetime``,
    ``datetime.date``, and anything whose ``str()`` is an ISO date or ISO
    timestamp.

    Raises:
        ValueError: If the value's string form is neither an ISO date nor an
            ISO timestamp.
    """
    # Imported locally so the helper does not depend on module-level imports
    # beyond ``date``.
    from datetime import datetime

    if hasattr(value, "to_pydatetime"):
        return value.to_pydatetime().date()
    # ``datetime`` subclasses ``date``, so it must be tested first. Otherwise
    # a full timestamp would be stored where a calendar date is expected.
    if isinstance(value, datetime):
        return value.date()
    if isinstance(value, date):
        return value
    text = str(value)
    try:
        return date.fromisoformat(text)
    except ValueError:
        # Accept ISO timestamps too (e.g. "2024-01-05T13:45:00"), such as the
        # ones export_service_requests_csv() writes into the date columns.
        return datetime.fromisoformat(text).date()


def dataframe_to_records(dataframe: Any) -> list[ServiceRequestRecord]:
    """Convert a DataFrame back into typed service-request records.

    ``created_date`` and ``closed_date`` are reduced to plain dates whether the
    cell holds a pandas timestamp, a ``datetime``, a ``date``, or an ISO
    date/timestamp string. Missing ``closed_date``, ``resolution_description``,
    ``latitude`` and ``longitude`` cells become ``None``.

    Raises:
        ValueError: If a required service-request column is missing, or a date
            cell cannot be parsed.
    """
    pd = require_pandas()
    required_columns = set(SERVICE_REQUEST_REQUIRED_DATAFRAME_COLUMNS)
    missing_columns = sorted(required_columns.difference(dataframe.columns))
    if missing_columns:
        missing = ", ".join(missing_columns)
        raise ValueError(
            f"DataFrame is missing required service-request columns: {missing}."
        )

    records: list[ServiceRequestRecord] = []
    for row in dataframe.to_dict(orient="records"):
        created_date = _coerce_to_date(row["created_date"])

        resolution_description = row.get("resolution_description")
        normalized_resolution = (
            None
            if resolution_description in (None, "") or pd.isna(resolution_description)
            else str(resolution_description)
        )

        raw_closed_date = row.get("closed_date")
        # The missing-value check must come before any isinstance test because
        # pd.NaT passes isinstance(x, datetime.date).
        closed_date: date | None = (
            None
            if raw_closed_date is None or pd.isna(raw_closed_date)
            else _coerce_to_date(raw_closed_date)
        )

        latitude = row.get("latitude")
        longitude = row.get("longitude")
        records.append(
            ServiceRequestRecord(
                service_request_id=str(row["service_request_id"]),
                created_date=created_date,
                complaint_type=str(row["complaint_type"]),
                descriptor=str(row["descriptor"]),
                borough=str(row["borough"]),
                community_district=str(row["community_district"]),
                resolution_description=normalized_resolution,
                latitude=None if latitude is None or pd.isna(latitude) else latitude,
                longitude=None
                if longitude is None or pd.isna(longitude)
                else longitude,
                closed_date=closed_date,
            )
        )
    return records

records_to_dataframe ¶

records_to_dataframe(
    records: list[ServiceRequestRecord],
) -> Any

Convert service-request records into a notebook-friendly DataFrame.

Source code in src/nyc311/dataframes/_records.py

def records_to_dataframe(records: list[ServiceRequestRecord]) -> Any:
    """Convert service-request records into a notebook-friendly DataFrame.

    One row is produced per record. Columns are laid out according to
    ``SERVICE_REQUEST_DATAFRAME_COLUMNS``; ``created_date`` and
    ``closed_date`` are passed through ``pandas.to_datetime`` when present.
    """
    pd = require_pandas()
    exported_fields = (
        "service_request_id",
        "created_date",
        "complaint_type",
        "descriptor",
        "borough",
        "community_district",
        "resolution_description",
        "closed_date",
        "latitude",
        "longitude",
    )
    rows = [
        {field_name: getattr(record, field_name) for field_name in exported_fields}
        for record in records
    ]
    frame = pd.DataFrame.from_records(
        rows,
        columns=SERVICE_REQUEST_DATAFRAME_COLUMNS,
    )
    # Both date columns get the same datetime coercion.
    for date_column in ("created_date", "closed_date"):
        if date_column in frame:
            frame[date_column] = pd.to_datetime(frame[date_column])
    return frame

resample_and_fill ¶

resample_and_fill(
    dataframe: Any,
    freq: str,
    *,
    method: Literal["zero", "ffill", "bfill"] = "zero",
) -> Any

Resample a DatetimeIndex-indexed frame and fill missing bins.

method='zero' fills missing values with 0 (typical for counts).

Source code in src/nyc311/dataframes/_timeseries.py

def resample_and_fill(
    dataframe: Any,
    freq: str,
    *,
    method: Literal["zero", "ffill", "bfill"] = "zero",
) -> Any:
    """Resample a DatetimeIndex-indexed frame and fill missing bins.

    ``method='zero'`` fills missing values with ``0`` (typical for counts).
    ``method='ffill'`` / ``method='bfill'`` propagate the neighbouring bin's
    sum into bins that received no valid observations.

    Args:
        dataframe: Frame indexed by a ``pandas.DatetimeIndex``. ``None`` or an
            empty frame is returned unchanged.
        freq: Resampling frequency; normalised with ``_normalize_pandas_freq``
            before use.
        method: How bins without data are filled.

    Returns:
        The resampled frame with per-bin sums and filled gaps.

    Raises:
        TypeError: If the frame's index is not a ``DatetimeIndex``.
        ValueError: If ``method`` is not one of the supported values.
    """
    pd = require_pandas()
    if dataframe is None or getattr(dataframe, "empty", True):
        return dataframe

    if not isinstance(dataframe.index, pd.DatetimeIndex):
        raise TypeError("resample_and_fill() expects a DatetimeIndex on the DataFrame.")

    freq = _normalize_pandas_freq(freq)
    resampler = dataframe.resample(freq)
    if method == "zero":
        return resampler.sum().fillna(0)
    if method in ("ffill", "bfill"):
        # sum() defaults to min_count=0, which reports empty bins as 0 and
        # leaves nothing for ffill/bfill to propagate. min_count=1 marks
        # those bins as NaN so the requested fill actually takes effect.
        with_gaps = resampler.sum(min_count=1)
        return with_gaps.ffill() if method == "ffill" else with_gaps.bfill()
    raise ValueError(f"Unsupported method: {method!r}.")

to_panel ¶

to_panel(
    records: list[ServiceRequestRecord],
    *,
    freq: str = "D",
    geography: str = "borough",
) -> Any

Return a panel of complaint counts indexed by (geography_value, period).

Columns are complaint types. Use .xs("BROOKLYN", level=0) for one area.

Source code in src/nyc311/dataframes/_timeseries.py

def to_panel(
    records: list[ServiceRequestRecord],
    *,
    freq: str = "D",
    geography: str = "borough",
) -> Any:
    """Return a panel of complaint counts indexed by ``(geography_value, period)``.

    Columns are complaint types. Use ``.xs("BROOKLYN", level=0)`` for one area.
    The two index levels are named ``geography_value`` and ``created_date``;
    an empty ``records`` list yields an empty DataFrame.
    """
    pd = require_pandas()
    if not records:
        return pd.DataFrame()

    freq = _normalize_pandas_freq(freq)
    frame = records_to_dataframe(records).assign(
        _geography=[record.geography_value(geography) for record in records]
    )

    group_keys = [
        "_geography",
        pd.Grouper(key="created_date", freq=freq),
        "complaint_type",
    ]
    # size() counts rows per (area, period, complaint type); unstack pivots
    # the innermost key (complaint type) into columns.
    panel = frame.groupby(group_keys).size().unstack(fill_value=0)
    panel.index.names = ("geography_value", "created_date")
    return panel.sort_index()

to_timeseries ¶

to_timeseries(
    records: list[ServiceRequestRecord], *, freq: str = "D"
) -> Any

Return complaint counts per period with a pandas.DatetimeIndex.

Columns are complaint types (wide format). Suitable for .plot(), .rolling(), and .resample().

Source code in src/nyc311/dataframes/_timeseries.py

def to_timeseries(
    records: list[ServiceRequestRecord],
    *,
    freq: str = "D",
) -> Any:
    """Return complaint counts per period with a :class:`~pandas.DatetimeIndex`.

    Columns are complaint types (wide format). Suitable for ``.plot()``, ``.rolling()``,
    and ``.resample()``. An empty ``records`` list yields an empty DataFrame.
    """
    pd = require_pandas()
    if not records:
        return pd.DataFrame()

    freq = _normalize_pandas_freq(freq)
    frame = records_to_dataframe(records)
    by_period = pd.Grouper(key="created_date", freq=freq)
    # Count rows per (period, complaint type), then pivot types into columns.
    tallies = frame.groupby([by_period, "complaint_type"]).size()
    wide = tallies.unstack(fill_value=0).sort_index()
    wide.index.name = "created_date"
    return wide

to_topic_timeseries ¶

to_topic_timeseries(
    assignments: list[TopicAssignment], *, freq: str = "D"
) -> Any

Like to_timeseries, but aggregates extracted topic labels.

Source code in src/nyc311/dataframes/_timeseries.py

def to_topic_timeseries(
    assignments: list[TopicAssignment],
    *,
    freq: str = "D",
) -> Any:
    """Like :func:`to_timeseries` but aggregates extracted topic labels.

    Each assignment contributes its record's ``created_date`` and its
    ``topic``; the result has one column per topic and one row per period.
    An empty ``assignments`` list yields an empty DataFrame.
    """
    pd = require_pandas()
    if not assignments:
        return pd.DataFrame()

    freq = _normalize_pandas_freq(freq)
    created = pd.to_datetime([item.record.created_date for item in assignments])
    topics = [item.topic for item in assignments]
    frame = pd.DataFrame({"created_date": created, "topic": topics})

    by_period = pd.Grouper(key="created_date", freq=freq)
    tallies = frame.groupby([by_period, "topic"]).size()
    wide = tallies.unstack(fill_value=0).sort_index()
    wide.index.name = "created_date"
    return wide

Spatial¶

nyc311.spatial ¶

Optional geospatial helpers built on top of the typed nyc311 models.

The nyc311.spatial module is the GeoDataFrame-flavoured sibling of nyc311.geographies — it loads boundary layers and records as geopandas frames, spatially joins records to boundaries, and materialises typed summaries as map-ready GeoDataFrames.

.. note::

For polygon-centroid points (distance-band spatial weights, Moran's I / LISA, nearest-neighbour joins, choropleth label placement), nyc311 deliberately does not ship a centroid helper in this module. Use upstream instead:

.. code-block:: python

   from nyc_geo_toolkit import (
       centroids_from_boundaries,
       load_nyc_boundaries,
   )

   cbs = load_nyc_boundaries("community_district")
   # representative=True keeps the point inside the polygon —
   # matters for non-convex NYC shorelines.
   points = centroids_from_boundaries(cbs, representative=True)

Shipped as a first-class helper in nyc-geo-toolkit v0.4.0 (on PyPI as v0.4.1 since 2026-04-21). Requires the [spatial] extra on nyc-geo-toolkit for the shapely dependency. See also nyc311.temporal.centroids_from_boundaries, which returns a shapely-free dict[str, (lat, lon)] suitable for direct use with nyc311.temporal.build_distance_weights.

load_boundaries_geodataframe ¶

load_boundaries_geodataframe(
    source: str | Path | BoundaryCollection | None = None,
    *,
    layer: str | None = None,
) -> Any

Load supported boundaries from a path, collection, or packaged layer.

.. note::

Need polygon centroids for spatial weights / Moran's I / label placement? The upstream nyc_geo_toolkit.centroids_from_boundaries helper (v0.4+) converts any polygon BoundaryCollection into a Point BoundaryCollection, preserving geography / vintage / properties. Pair with representative=True for non-convex polygons. See the nyc311.spatial module docstring for the full recipe.

Source code in src/nyc311/spatial/_boundaries.py

def load_boundaries_geodataframe(
    source: str | Path | BoundaryCollection | None = None,
    *,
    layer: str | None = None,
) -> Any:
    """Load supported boundaries from a path, collection, or packaged layer.

    Resolution order for ``source``: an in-memory ``BoundaryCollection`` is
    converted directly; a ``Path`` (or a string naming an existing path) is
    loaded from disk; any other value is first tried as a packaged layer
    name and, if that raises ``ValueError``, loaded as a boundary file.

    Raises:
        ValueError: If both ``source`` and ``layer`` are given.
        TypeError: If neither ``source`` nor ``layer`` is given.

    .. note::

       Need polygon centroids for spatial weights / Moran's I / label
       placement? Upstream :func:`nyc_geo_toolkit.centroids_from_boundaries`
       (v0.4+) converts any polygon ``BoundaryCollection`` into a Point
       ``BoundaryCollection``, preserving geography / vintage / properties.
       Pair with ``representative=True`` for non-convex polygons. See the
       :mod:`nyc311.spatial` module docstring for the full recipe.
    """
    if layer is not None and source is not None:
        raise ValueError("Pass either source or layer, not both.")
    if layer is not None:
        return _load_nyc_boundaries_geodataframe(layer)
    if source is None:
        raise TypeError("load_boundaries_geodataframe() requires source or layer.")

    if isinstance(source, BoundaryCollection):
        collection = source
    elif isinstance(source, Path) or Path(source).exists():
        collection = load_boundary_collection(source)
    else:
        # Ambiguous string: prefer a packaged layer name, fall back to a file.
        try:
            return _load_nyc_boundaries_geodataframe(str(source))
        except ValueError:
            return _boundary_collection_to_geodataframe(
                load_boundary_collection(source)
            )
    return _boundary_collection_to_geodataframe(collection)

spatial_join_records_to_boundaries ¶

spatial_join_records_to_boundaries(
    records_gdf: Any, boundaries_gdf: Any
) -> Any

Join point records to boundary polygons without clobbering record columns.

Source code in src/nyc311/spatial/_joins.py

def spatial_join_records_to_boundaries(records_gdf: Any, boundaries_gdf: Any) -> Any:
    """Join point records to boundary polygons without clobbering record columns.

    Boundaries are reprojected to the records' CRS when both frames carry a
    CRS and the two differ. Every non-geometry boundary column is prefixed
    with ``boundary_`` before a left ``within`` join, and the helper
    ``index_right`` column is dropped from the result.
    """
    geopandas, _ = require_geospatial_stack()

    records_crs = getattr(records_gdf, "crs", None)
    boundaries_crs = getattr(boundaries_gdf, "crs", None)
    if records_crs and boundaries_crs and records_crs != boundaries_crs:
        boundaries = boundaries_gdf.to_crs(records_crs)
    else:
        boundaries = boundaries_gdf

    prefixed_names = {
        name: f"boundary_{name}" for name in boundaries.columns if name != "geometry"
    }
    joined = geopandas.sjoin(
        records_gdf,
        boundaries.rename(columns=prefixed_names),
        how="left",
        predicate="within",
    )
    if "index_right" not in joined.columns:
        return joined
    return joined.drop(columns="index_right")

records_to_geodataframe ¶

records_to_geodataframe(
    records: list[ServiceRequestRecord],
) -> Any

Convert point-capable service-request records into a GeoDataFrame.

Source code in src/nyc311/spatial/_points.py

def records_to_geodataframe(records: list[ServiceRequestRecord]) -> Any:
    """Convert point-capable service-request records into a GeoDataFrame.

    Records missing either coordinate are skipped. The result uses
    ``EPSG:4326`` with longitude as x and latitude as y; when no record has
    coordinates an empty frame with the standard columns is returned.
    """
    geopandas, _ = require_geospatial_stack()
    located = [
        record
        for record in records
        if not (record.latitude is None or record.longitude is None)
    ]
    if located:
        frame = records_to_dataframe(located).copy()
        points = geopandas.points_from_xy(frame["longitude"], frame["latitude"])
        return geopandas.GeoDataFrame(frame, geometry=points, crs="EPSG:4326")

    return geopandas.GeoDataFrame(
        columns=(*SERVICE_REQUEST_DATAFRAME_COLUMNS, "geometry"),
        geometry="geometry",
        crs="EPSG:4326",
    )

summaries_to_geodataframe ¶

summaries_to_geodataframe(
    summaries: list[Any],
    boundaries_gdf: Any = None,
    *,
    layer: str | None = None,
) -> Any

Merge aggregated geography summaries onto boundary geometries.

Source code in src/nyc311/spatial/_summaries.py

def summaries_to_geodataframe(
    summaries: list[Any],
    boundaries_gdf: Any = None,
    *,
    layer: str | None = None,
) -> Any:
    """Merge aggregated geography summaries onto boundary geometries.

    When ``boundaries_gdf`` is omitted, boundaries are loaded for ``layer``,
    or for the ``geography`` of the first summary if ``layer`` is also
    omitted. Summaries are left-merged onto the boundaries by ``geography``
    and ``geography_value``, so every boundary row is kept.

    Raises:
        ValueError: If no boundaries can be resolved (no frame, no layer and
            no summaries), or the boundary frame lacks a join column.
    """
    geopandas, _ = require_geospatial_stack()
    if boundaries_gdf is None:
        if layer is None and not summaries:
            raise ValueError(
                "summaries_to_geodataframe() requires boundaries_gdf or layer "
                "when summaries is empty."
            )
        resolved_layer = summaries[0].geography if layer is None else layer
        boundaries_gdf = load_boundaries_geodataframe(layer=resolved_layer)

    missing_column_errors = {
        "geography": "boundaries_gdf must include a geography column.",
        "geography_value": "boundaries_gdf must include a geography_value column.",
    }
    for join_column, message in missing_column_errors.items():
        if join_column not in boundaries_gdf.columns:
            raise ValueError(message)

    merged = boundaries_gdf.merge(
        summaries_to_dataframe(summaries),
        on=["geography", "geography_value"],
        how="left",
    )
    return geopandas.GeoDataFrame(merged, geometry="geometry", crs=boundaries_gdf.crs)

Plotting¶

nyc311.plotting ¶

Optional in-memory plotting helpers for NYC boundary maps.

plot_boundary_choropleth ¶

plot_boundary_choropleth(
    geodataframe: Any,
    *,
    column: str,
    title: str,
    cmap: str = "viridis",
    categorical: bool = False,
    add_basemap: bool = False,
    figsize: tuple[float, float] = (10, 8),
    outline_gdf: Any | None = None,
    legend_title: str | None = None,
    legend_kwds: dict[str, Any] | None = None,
) -> Any

Render a choropleth map and return the matplotlib figure.

Source code in src/nyc311/plotting.py

def plot_boundary_choropleth(
    geodataframe: Any,
    *,
    column: str,
    title: str,
    cmap: str = "viridis",
    categorical: bool = False,
    add_basemap: bool = False,
    figsize: tuple[float, float] = (10, 8),
    outline_gdf: Any | None = None,
    legend_title: str | None = None,
    legend_kwds: dict[str, Any] | None = None,
) -> Any:
    """Render a choropleth map and return the matplotlib figure.

    Rows whose ``column`` value is missing are drawn in neutral grey; in
    categorical mode a "No data" entry is appended to the legend for them.

    Args:
        geodataframe: Frame to colour by ``column``.
        column: Name of the column that drives the fill colour.
        title: Title forwarded to ``_finish_axes``.
        cmap: Matplotlib colormap name.
        categorical: Treat ``column`` as categories (legend) instead of a
            continuous value (colorbar).
        add_basemap: Add a CartoDB Positron basemap via contextily and draw
            the data layer semi-transparent.
        figsize: Figure size passed to ``plt.subplots``.
        outline_gdf: Optional frame whose boundaries are drawn on top.
            Empty frames are ignored.
        legend_title: Title forwarded to ``_style_legend``.
        legend_kwds: Overrides merged over the default legend/colorbar options.

    Raises:
        TypeError: If the prepared ``geodataframe`` is ``None``.
    """
    plt = _require_matplotlib()
    plt.style.use("seaborn-v0_8-whitegrid")
    plot_gdf = _prepare_plot_frame(geodataframe, add_basemap=add_basemap)
    outline_frame = _prepare_plot_frame(outline_gdf, add_basemap=add_basemap)
    if plot_gdf is None:
        raise TypeError("plot_boundary_choropleth() requires a geodataframe.")
    _figure, axes = plt.subplots(figsize=figsize)
    if categorical:
        effective_legend_kwds: dict[str, Any] = {
            "loc": "upper left",
            "bbox_to_anchor": (1.02, 1),
            "frameon": True,
            "borderaxespad": 0.0,
        }
    else:
        # Continuous choropleth uses a matplotlib colorbar (not a legend).
        effective_legend_kwds = {"shrink": 0.72, "label": column}
    if legend_kwds is not None:
        effective_legend_kwds.update(legend_kwds)
    # Split rows so missing values can be styled separately from real data.
    missing_mask = plot_gdf[column].isna()
    missing_frame = plot_gdf[missing_mask]
    data_frame = plot_gdf[~missing_mask]
    if not data_frame.empty:
        data_frame.plot(
            ax=axes,
            column=column,
            legend=True,
            cmap=cmap,
            categorical=categorical,
            edgecolor="#334155",
            linewidth=0.7,
            alpha=0.7 if add_basemap else 1.0,
            legend_kwds=effective_legend_kwds,
        )
    if not missing_frame.empty:
        missing_frame.plot(
            ax=axes,
            color="#d4d4d8",
            edgecolor="#94a3b8",
            linewidth=0.7,
        )
        legend = axes.get_legend()
        if categorical and legend is not None:
            # Rebuild the legend with the existing entries plus a grey
            # "No data" patch matching the missing-value fill.
            matplotlib_patches = import_module("matplotlib.patches")
            handles = list(legend.legend_handles)
            labels = [text.get_text() for text in legend.get_texts()]
            handles.append(
                matplotlib_patches.Patch(
                    facecolor="#d4d4d8",
                    edgecolor="#94a3b8",
                    label="No data",
                )
            )
            labels.append("No data")
            legend.remove()
            axes.legend(handles, labels, **effective_legend_kwds)
    # Skip empty outline frames, matching plot_boundary_point_groups;
    # plotting an empty frame only produces an empty-plot warning.
    if outline_frame is not None and not outline_frame.empty:
        outline_frame.boundary.plot(
            ax=axes,
            color="#0f172a",
            linewidth=1.15 if not add_basemap else 0.9,
            alpha=0.75,
        )
    axes.set_axis_off()
    if add_basemap:
        contextily = _require_contextily()
        contextily.add_basemap(
            axes,
            source=contextily.providers.CartoDB.Positron,
            attribution_size=6,
        )
    _style_legend(axes, title=legend_title)
    _finish_axes(axes, title=title)
    return axes.figure

plot_boundary_preview ¶

plot_boundary_preview(
    boundaries_gdf: Any,
    *,
    title: str,
    points_gdf: Any | None = None,
    add_basemap: bool = False,
    figsize: tuple[float, float] = (10, 8),
) -> Any

Render boundary outlines and optional points, then return the figure.

Source code in src/nyc311/plotting.py

def plot_boundary_preview(
    boundaries_gdf: Any,
    *,
    title: str,
    points_gdf: Any | None = None,
    add_basemap: bool = False,
    figsize: tuple[float, float] = (10, 8),
) -> Any:
    """Render boundary outlines and optional points, then return the figure.

    Outlines are drawn first, then any non-empty point layer in red, then the
    optional CartoDB Positron basemap. Raises ``TypeError`` when the prepared
    boundary frame is ``None``.
    """
    plt = _require_matplotlib()
    plt.style.use("seaborn-v0_8-whitegrid")
    outlines = _prepare_plot_frame(boundaries_gdf, add_basemap=add_basemap)
    markers = _prepare_plot_frame(points_gdf, add_basemap=add_basemap)
    if outlines is None:
        raise TypeError("plot_boundary_preview() requires boundaries_gdf.")

    _, axes = plt.subplots(figsize=figsize)
    outlines.boundary.plot(ax=axes, color="#1f2937", linewidth=1.25)

    has_points = markers is not None and not markers.empty
    if has_points:
        markers.plot(ax=axes, color="#dc2626", markersize=18, alpha=0.8)

    if add_basemap:
        tiles = _require_contextily()
        tiles.add_basemap(
            axes,
            source=tiles.providers.CartoDB.Positron,
            attribution_size=6,
        )
    _finish_axes(axes, title=title)
    return axes.figure

plot_boundary_point_groups ¶

plot_boundary_point_groups(
    boundaries_gdf: Any,
    *,
    title: str,
    matched_points_gdf: Any | None = None,
    unmatched_points_gdf: Any | None = None,
    context_gdf: Any | None = None,
    outline_gdf: Any | None = None,
    matched_label: str = "Matched",
    unmatched_label: str = "Unmatched",
    add_basemap: bool = False,
    figsize: tuple[float, float] = (10, 8),
) -> Any

Render categorized points over highlighted boundaries and optional context.

Source code in src/nyc311/plotting.py

def plot_boundary_point_groups(
    boundaries_gdf: Any,
    *,
    title: str,
    matched_points_gdf: Any | None = None,
    unmatched_points_gdf: Any | None = None,
    context_gdf: Any | None = None,
    outline_gdf: Any | None = None,
    matched_label: str = "Matched",
    unmatched_label: str = "Unmatched",
    add_basemap: bool = False,
    figsize: tuple[float, float] = (10, 8),
) -> Any:
    """Render categorized points over highlighted boundaries and optional context.

    Layers are drawn in this order: context polygons, boundary outlines,
    extra outlines, matched points (green), unmatched points (red "x"
    markers), and finally the optional basemap. Optional layers that are
    ``None`` or empty are skipped. A legend is added only when at least one
    labelled point layer was drawn.

    Args:
        boundaries_gdf: Frame whose boundaries are always outlined.
        title: Title forwarded to ``_finish_axes``.
        matched_points_gdf: Optional points drawn with ``matched_label``.
        unmatched_points_gdf: Optional points drawn with ``unmatched_label``.
        context_gdf: Optional background polygons drawn in a light fill.
        outline_gdf: Optional frame whose boundaries are drawn as a darker
            overlay.
        matched_label: Legend label for matched points.
        unmatched_label: Legend label for unmatched points.
        add_basemap: Add a CartoDB Positron basemap via contextily.
        figsize: Figure size passed to ``plt.subplots``.

    Raises:
        TypeError: If the prepared ``boundaries_gdf`` is ``None``.
    """
    plt = _require_matplotlib()
    plt.style.use("seaborn-v0_8-whitegrid")
    # Every input goes through the same preparation helper.
    # NOTE(review): presumably it returns None for a None input and
    # reprojects when add_basemap is set; confirm in _prepare_plot_frame.
    boundary_frame = _prepare_plot_frame(boundaries_gdf, add_basemap=add_basemap)
    context_frame = _prepare_plot_frame(context_gdf, add_basemap=add_basemap)
    outline_frame = _prepare_plot_frame(outline_gdf, add_basemap=add_basemap)
    matched_frame = _prepare_plot_frame(matched_points_gdf, add_basemap=add_basemap)
    unmatched_frame = _prepare_plot_frame(unmatched_points_gdf, add_basemap=add_basemap)
    if boundary_frame is None:
        raise TypeError("plot_boundary_point_groups() requires boundaries_gdf.")

    figure, axes = plt.subplots(figsize=figsize)
    # Background context goes first so later layers are drawn on top of it.
    if context_frame is not None and not context_frame.empty:
        context_frame.plot(
            ax=axes,
            color="#f1f5f9",
            edgecolor="#cbd5e1",
            linewidth=0.5,
        )
    boundary_frame.boundary.plot(
        ax=axes,
        color="#334155",
        linewidth=1.25,
    )
    if outline_frame is not None and not outline_frame.empty:
        outline_frame.boundary.plot(
            ax=axes,
            color="#0f172a",
            # Slightly thinner outline when a basemap is shown.
            linewidth=1.15 if not add_basemap else 0.9,
            alpha=0.75,
        )
    if matched_frame is not None and not matched_frame.empty:
        # Marker size and alpha come from _point_style, keyed on point count.
        matched_style = _point_style(len(matched_frame), matched=True)
        matched_frame.plot(
            ax=axes,
            color="#16a34a",
            markersize=matched_style["markersize"],
            alpha=matched_style["alpha"],
            label=matched_label,
        )
    if unmatched_frame is not None and not unmatched_frame.empty:
        unmatched_style = _point_style(len(unmatched_frame), matched=False)
        unmatched_frame.plot(
            ax=axes,
            color="#dc2626",
            markersize=unmatched_style["markersize"],
            marker="x",
            linewidth=unmatched_style["linewidth"],
            alpha=unmatched_style["alpha"],
            label=unmatched_label,
        )
    if add_basemap:
        contextily = _require_contextily()
        contextily.add_basemap(
            axes,
            source=contextily.providers.CartoDB.Positron,
            attribution_size=6,
        )
    # Only build a legend when some labelled artist exists on the axes.
    legend_handles, _legend_labels = axes.get_legend_handles_labels()
    if legend_handles:
        axes.legend(loc="lower left", frameon=True)
        _style_legend(axes)
    _finish_axes(axes, title=title)
    return figure

plot_timeseries ¶

plot_timeseries(
    dataframe: Any,
    *,
    title: str,
    figsize: tuple[float, float] = (12, 5),
    footnote: str | None = None,
) -> Any

Line chart for a pandas.DataFrame with a DatetimeIndex or created_date column.

Source code in src/nyc311/plotting.py

def plot_timeseries(
    dataframe: Any,
    *,
    title: str,
    figsize: tuple[float, float] = (12, 5),
    footnote: str | None = None,
) -> Any:
    """Line chart for a :class:`~pandas.DataFrame` with a DatetimeIndex or ``created_date`` column.

    A frame already indexed by a ``DatetimeIndex`` is plotted as-is; otherwise
    a ``created_date`` column, when present, becomes the sorted index. When
    ``footnote`` is given it is rendered centred beneath the axes.
    """
    plt = _require_matplotlib()
    pd = import_module("pandas")
    plt.style.use("seaborn-v0_8-whitegrid")
    _, axes = plt.subplots(figsize=figsize)

    if isinstance(dataframe.index, pd.DatetimeIndex):
        series_frame = dataframe
    elif "created_date" not in getattr(dataframe, "columns", ()):
        series_frame = dataframe.copy()
    else:
        series_frame = dataframe.set_index("created_date").sort_index()

    series_frame.plot(ax=axes, legend=True)
    axes.set_title(title, pad=12)
    axes.set_xlabel("")
    axes.grid(True, alpha=0.3)
    axes.figure.patch.set_facecolor("white")

    if not footnote:
        return axes.figure

    # Reserve space below the plot, then centre the note inside it.
    host_figure = axes.figure
    host_figure.subplots_adjust(bottom=0.16)
    host_figure.text(
        0.5,
        0.02,
        footnote,
        ha="center",
        fontsize=8,
        color="#555",
        va="bottom",
        wrap=True,
    )
    return axes.figure

plot_complaint_heatmap ¶

plot_complaint_heatmap(
    dataframe: Any,
    *,
    title: str,
    time_column: str = "created_date",
    figsize: tuple[float, float] = (10, 6),
) -> Any

Hour-of-day x day-of-week density heatmap (expects datetime resolution in time_column).

Source code in src/nyc311/plotting.py

def plot_complaint_heatmap(
    dataframe: Any,
    *,
    title: str,
    time_column: str = "created_date",
    figsize: tuple[float, float] = (10, 6),
) -> Any:
    """Hour-of-day x day-of-week density heatmap (expects datetime resolution in ``time_column``).

    Rows are weekdays (Monday first, at the bottom), columns are hours 0-23,
    and each cell counts the rows falling in that weekday/hour. Raises
    ``ValueError`` when ``time_column`` is missing from the frame.
    """
    plt = _require_matplotlib()
    pd = import_module("pandas")
    np = import_module("numpy")
    plt.style.use("seaborn-v0_8-whitegrid")
    if time_column not in dataframe.columns:
        raise ValueError(f"DataFrame must include column {time_column!r}.")

    stamps = pd.to_datetime(dataframe[time_column])
    tally = pd.DataFrame(
        {"hour": stamps.dt.hour, "weekday": stamps.dt.dayofweek}
    ).assign(n=1)
    per_cell = tally.groupby(["weekday", "hour"], observed=False)["n"].sum()
    # Reindex to the full 7 x 24 grid so unseen weekday/hour cells show as 0.
    grid = per_cell.unstack(fill_value=0).reindex(
        index=range(7), columns=range(24), fill_value=0
    )

    weekday_names = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    _, axes = plt.subplots(figsize=figsize)
    image = axes.imshow(
        np.asarray(grid), aspect="auto", cmap="YlOrRd", origin="lower"
    )
    axes.set_xticks(range(0, 24, 2))
    axes.set_yticks(range(7))
    axes.set_yticklabels(weekday_names)
    axes.set_xlabel("Hour of day")
    axes.set_ylabel("Weekday")
    axes.set_title(title, pad=12)
    plt.colorbar(image, ax=axes, fraction=0.046, pad=0.04, label="Complaints")
    axes.figure.patch.set_facecolor("white")
    return axes.figure

plot_stacked_area ¶

plot_stacked_area(
    dataframe: Any,
    *,
    title: str,
    top_n: int = 8,
    figsize: tuple[float, float] = (12, 6),
) -> Any

Stacked area chart of the top-N columns (by total) over a DatetimeIndex.

Source code in src/nyc311/plotting.py

def plot_stacked_area(
    dataframe: Any,
    *,
    title: str,
    top_n: int = 8,
    figsize: tuple[float, float] = (12, 6),
) -> Any:
    """Stacked area chart of the top-N columns (by total) over a DatetimeIndex.

    Columns are ranked by their sum and the ``top_n`` largest are stacked,
    with missing values treated as ``0``. If that selection has no columns,
    the whole frame is used instead. Raises ``TypeError`` when the frame is
    not indexed by a ``DatetimeIndex``.
    """
    plt = _require_matplotlib()
    pd = import_module("pandas")
    plt.style.use("seaborn-v0_8-whitegrid")
    if not isinstance(dataframe.index, pd.DatetimeIndex):
        raise TypeError(
            "plot_stacked_area() expects a DatetimeIndex-indexed DataFrame."
        )

    ranked_totals = dataframe.sum().sort_values(ascending=False)
    leading_columns = list(ranked_totals.head(top_n).index)
    stacked = dataframe[leading_columns].fillna(0)
    if stacked.shape[1] == 0:
        stacked = dataframe.fillna(0)

    mdates = import_module("matplotlib.dates")
    _, axes = plt.subplots(figsize=figsize)
    # stackplot receives matplotlib date numbers rather than timestamps.
    x_values = mdates.date2num(pd.DatetimeIndex(stacked.index).to_pydatetime())
    layers = [stacked[name].to_numpy() for name in stacked.columns]
    axes.stackplot(
        x_values,
        *layers,
        labels=list(stacked.columns),
        alpha=0.85,
    )
    axes.xaxis_date()
    axes.figure.autofmt_xdate()
    axes.legend(loc="upper left", bbox_to_anchor=(1.02, 1), frameon=True)
    axes.set_title(title, pad=12)
    axes.set_xlabel("")
    axes.grid(True, alpha=0.25)
    axes.figure.patch.set_facecolor("white")
    return axes.figure

plot_bar_counts ¶

plot_bar_counts(
    labels: list[str],
    counts: list[float],
    *,
    title: str,
    horizontal: bool = False,
    figsize: tuple[float, float] = (10, 6),
) -> Any

Simple bar chart for categorical counts.

Source code in src/nyc311/plotting.py

def plot_bar_counts(
    labels: list[str],
    counts: list[float],
    *,
    title: str,
    horizontal: bool = False,
    figsize: tuple[float, float] = (10, 6),
) -> Any:
    """Simple bar chart for categorical counts.

    Draws vertical bars by default (with tick labels rotated 45 degrees) or
    horizontal bars when ``horizontal`` is true, and returns the figure.
    """
    plt = _require_matplotlib()
    plt.style.use("seaborn-v0_8-whitegrid")
    _, axes = plt.subplots(figsize=figsize)

    bar_style = {"color": "#3b82f6", "edgecolor": "#1e40af", "linewidth": 0.5}
    if not horizontal:
        axes.bar(labels, counts, **bar_style)
        plt.setp(axes.xaxis.get_majorticklabels(), rotation=45, ha="right")
    else:
        axes.barh(labels, counts, **bar_style)

    axes.set_title(title, pad=12)
    axes.grid(True, axis="y", alpha=0.3)
    axes.figure.patch.set_facecolor("white")
    return axes.figure

plot_complaint_scatter ¶

plot_complaint_scatter(
    points_gdf: Any,
    *,
    boundaries_gdf: Any | None = None,
    title: str,
    column: str = "complaint_type",
    add_basemap: bool = False,
    figsize: tuple[float, float] = (12, 10),
    legend_top_n: int | None = None,
) -> Any

Scatter plot of points colored by column over optional boundary outlines.

Source code in src/nyc311/plotting.py

def plot_complaint_scatter(
    points_gdf: Any,
    *,
    boundaries_gdf: Any | None = None,
    title: str,
    column: str = "complaint_type",
    add_basemap: bool = False,
    figsize: tuple[float, float] = (12, 10),
    legend_top_n: int | None = None,
) -> Any:
    """Scatter plot of points colored by ``column`` over optional boundary outlines.

    Boundary outlines (when given and non-empty) are drawn beneath the
    points. With ``legend_top_n`` set, the legend is rebuilt by
    ``_apply_top_n_categorical_point_legend``; otherwise the default legend
    is styled as-is. Raises ``TypeError`` when there are no points to plot.
    """
    plt = _require_matplotlib()
    plt.style.use("seaborn-v0_8-whitegrid")
    markers = _prepare_plot_frame(points_gdf, add_basemap=add_basemap)
    outlines = _prepare_plot_frame(boundaries_gdf, add_basemap=add_basemap)
    if markers is None or markers.empty:
        raise TypeError(
            "plot_complaint_scatter() requires a non-empty points GeoDataFrame."
        )

    _, axes = plt.subplots(figsize=figsize)
    # Anchor the legend just outside the right edge of the axes.
    legend_options = {"bbox_to_anchor": (1.02, 1), "loc": "upper left"}

    has_outlines = outlines is not None and not outlines.empty
    if has_outlines:
        outlines.boundary.plot(ax=axes, color="#0f172a", linewidth=0.8, alpha=0.7)

    markers.plot(
        ax=axes,
        column=column,
        legend=True,
        markersize=12,
        alpha=0.5,
        categorical=True,
        cmap="tab20",
        legend_kwds=legend_options,
    )

    if add_basemap:
        tiles = _require_contextily()
        tiles.add_basemap(
            axes,
            source=tiles.providers.CartoDB.Positron,
            attribution_size=6,
        )

    if legend_top_n is None:
        _style_legend(axes)
    else:
        _apply_top_n_categorical_point_legend(
            axes,
            point_frame=markers,
            column=column,
            top_n=legend_top_n,
            legend_kwds=legend_options,
        )
    _finish_axes(axes, title=title)
    return axes.figure

plot_hero_banner ¶

plot_hero_banner(
    points_gdf: Any,
    *,
    boundaries_gdf: Any | None = None,
    title: str,
    bbox: tuple[float, float, float, float] | None = None,
    column: str = "complaint_type",
    figsize: tuple[float, float] = (16, 5),
    legend_top_n: int | None = None,
) -> Any

Wide horizontal map with OSM basemap, points, and boundaries (Web Mercator).

Source code in src/nyc311/plotting.py

def plot_hero_banner(
    points_gdf: Any,
    *,
    boundaries_gdf: Any | None = None,
    title: str,
    bbox: tuple[float, float, float, float] | None = None,
    column: str = "complaint_type",
    figsize: tuple[float, float] = (16, 5),
    legend_top_n: int | None = None,
) -> Any:
    """Wide horizontal map with OSM basemap, points, and boundaries (Web Mercator).

    When ``bbox`` is given as ``(minx, miny, maxx, maxy)``, points and
    boundaries are first clipped to it with ``.cx`` using the inputs' own
    coordinates. The basemap is always added. Raises ``TypeError`` when no
    points remain to plot.
    """
    plt = _require_matplotlib()
    plt.style.use("seaborn-v0_8-whitegrid")

    clipped_points, clipped_boundaries = points_gdf, boundaries_gdf
    if bbox is not None:
        x_min, y_min, x_max, y_max = bbox
        # GeoPandas uses float coordinate bounds; slice() in typeshed is int-only.
        window = (
            slice(cast(Any, x_min), cast(Any, x_max)),
            slice(cast(Any, y_min), cast(Any, y_max)),
        )
        clipped_points = points_gdf.cx[window]
        if boundaries_gdf is not None:
            clipped_boundaries = boundaries_gdf.cx[window]

    markers = _prepare_plot_frame(clipped_points, add_basemap=True)
    outlines = _prepare_plot_frame(clipped_boundaries, add_basemap=True)
    if markers is None or markers.empty:
        raise TypeError("plot_hero_banner() requires a non-empty points GeoDataFrame.")

    _, axes = plt.subplots(figsize=figsize)
    legend_options = {"bbox_to_anchor": (1.01, 1), "loc": "upper left", "fontsize": 8}

    has_outlines = outlines is not None and not outlines.empty
    if has_outlines:
        outlines.boundary.plot(ax=axes, color="#0f172a", linewidth=0.9, alpha=0.85)

    markers.plot(
        ax=axes,
        column=column,
        legend=True,
        markersize=8,
        alpha=0.65,
        categorical=True,
        cmap="tab20",
        legend_kwds=legend_options,
    )

    tiles = _require_contextily()
    tiles.add_basemap(
        axes,
        source=tiles.providers.CartoDB.Positron,
        attribution_size=5,
    )

    if legend_top_n is None:
        _style_legend(axes)
    else:
        _apply_top_n_categorical_point_legend(
            axes,
            point_frame=markers,
            column=column,
            top_n=legend_top_n,
            legend_kwds=legend_options,
        )
    axes.set_axis_off()
    axes.set_title(title, pad=10, fontsize=14, fontweight="600")
    axes.figure.patch.set_facecolor("white")
    return axes.figure

Presets¶

nyc311.presets ¶

Reusable preset builders for common nyc311 example and workflow inputs.

build_filter ¶

build_filter(
    *,
    start_date: date | str,
    end_date: date | str,
    geography: str = "borough",
    geography_value: str = models.BOROUGH_BROOKLYN,
    complaint_types: tuple[str, ...] = (),
) -> models.ServiceRequestFilter

Build a typed service-request filter from string-friendly inputs.

Source code in src/nyc311/presets.py

def build_filter(
    *,
    start_date: date | str,
    end_date: date | str,
    geography: str = "borough",
    geography_value: str = models.BOROUGH_BROOKLYN,
    complaint_types: tuple[str, ...] = (),
) -> models.ServiceRequestFilter:
    """Assemble a service-request filter from loosely typed keyword inputs.

    Both date bounds are passed through ``_coerce_date`` and the geography
    pair is wrapped in a ``models.GeographyFilter`` before the filter is
    constructed.

    Args:
        start_date: Start bound of the filter, as a ``date`` or a string.
        end_date: End bound of the filter, as a ``date`` or a string.
        geography: Geography level name. Defaults to ``"borough"``.
        geography_value: Value selecting the unit within ``geography``.
            Defaults to ``models.BOROUGH_BROOKLYN``.
        complaint_types: Complaint types forwarded unchanged to the filter.

    Returns:
        The constructed ``models.ServiceRequestFilter``.
    """
    window_start = _coerce_date(start_date)
    window_end = _coerce_date(end_date)
    geography_filter = models.GeographyFilter(geography, geography_value)
    return models.ServiceRequestFilter(
        start_date=window_start,
        end_date=window_end,
        geography=geography_filter,
        complaint_types=complaint_types,
    )

brooklyn_borough_filter ¶

brooklyn_borough_filter(
    *,
    start_date: date | str,
    end_date: date | str,
    complaint_types: tuple[str, ...] = (),
) -> models.ServiceRequestFilter

Build a borough-level Brooklyn filter.

Source code in src/nyc311/presets.py

def brooklyn_borough_filter(
    *,
    start_date: date | str,
    end_date: date | str,
    complaint_types: tuple[str, ...] = (),
) -> models.ServiceRequestFilter:
    """Return a filter pinned to the borough of Brooklyn.

    Thin convenience wrapper around :func:`build_filter` that fixes the
    geography to ``"borough"`` / ``models.BOROUGH_BROOKLYN``.

    Args:
        start_date: Start bound of the filter, as a ``date`` or a string.
        end_date: End bound of the filter, as a ``date`` or a string.
        complaint_types: Complaint types forwarded unchanged.

    Returns:
        The ``models.ServiceRequestFilter`` produced by ``build_filter``.
    """
    brooklyn_filter = build_filter(
        geography="borough",
        geography_value=models.BOROUGH_BROOKLYN,
        start_date=start_date,
        end_date=end_date,
        complaint_types=complaint_types,
    )
    return brooklyn_filter

manhattan_borough_filter ¶

manhattan_borough_filter(
    *,
    start_date: date | str,
    end_date: date | str,
    complaint_types: tuple[str, ...] = (),
) -> models.ServiceRequestFilter

Build a borough-level Manhattan filter.

Source code in src/nyc311/presets.py

def manhattan_borough_filter(
    *,
    start_date: date | str,
    end_date: date | str,
    complaint_types: tuple[str, ...] = (),
) -> models.ServiceRequestFilter:
    """Return a filter pinned to the borough of Manhattan.

    Thin convenience wrapper around :func:`build_filter` that fixes the
    geography to ``"borough"`` / ``models.BOROUGH_MANHATTAN``.

    Args:
        start_date: Start bound of the filter, as a ``date`` or a string.
        end_date: End bound of the filter, as a ``date`` or a string.
        complaint_types: Complaint types forwarded unchanged.

    Returns:
        The ``models.ServiceRequestFilter`` produced by ``build_filter``.
    """
    manhattan_filter = build_filter(
        geography="borough",
        geography_value=models.BOROUGH_MANHATTAN,
        start_date=start_date,
        end_date=end_date,
        complaint_types=complaint_types,
    )
    return manhattan_filter

small_socrata_config ¶

small_socrata_config(
    *,
    page_size: int = 500,
    max_pages: int | None = 1,
    app_token: str | None = None,
) -> models.SocrataConfig

Build a small Socrata config suited to examples and local iteration.

Source code in src/nyc311/presets.py

def small_socrata_config(
    *,
    page_size: int = 500,
    max_pages: int | None = 1,
    app_token: str | None = None,
) -> models.SocrataConfig:
    """Return a lightweight Socrata config for examples and local iteration.

    Args:
        page_size: Forwarded to ``SocrataConfig.page_size``. Defaults to 500.
        max_pages: Forwarded to ``SocrataConfig.max_pages``. Defaults to 1.
        app_token: Optional app token forwarded to the config.

    Returns:
        A ``models.SocrataConfig`` built from the three arguments; every
        other config field keeps its own default.
    """
    compact_config = models.SocrataConfig(
        page_size=page_size,
        max_pages=max_pages,
        app_token=app_token,
    )
    return compact_config

large_socrata_config ¶

large_socrata_config(
    *,
    page_size: int = 5000,
    max_pages: int | None = None,
    app_token: str | None = None,
    request_timeout_seconds: float = 300.0,
    created_date_sort: Literal["asc", "desc"] = "asc",
) -> models.SocrataConfig

Build a high-throughput Socrata config for bulk downloads (e.g. full history).

Default page_size is 5,000 rows per request so each HTTP round-trip stays smaller than very large pages, with a five-minute read timeout per request. Use created_date_sort='desc' when you want the most recent rows first (e.g. capped smoke samples).

Source code in src/nyc311/presets.py

def large_socrata_config(
    *,
    page_size: int = 5_000,
    max_pages: int | None = None,
    app_token: str | None = None,
    request_timeout_seconds: float = 300.0,
    created_date_sort: Literal["asc", "desc"] = "asc",
) -> models.SocrataConfig:
    """Return a Socrata config tuned for bulk downloads (e.g. full history).

    The defaults request 5,000 rows per page, so each HTTP round-trip
    stays smaller than very large pages, and allow a five-minute read
    timeout per request. Pass ``created_date_sort='desc'`` to receive the
    most recent rows first (e.g. for capped smoke samples).

    Args:
        page_size: Rows per request. Defaults to 5,000.
        max_pages: Forwarded to ``SocrataConfig.max_pages``. Defaults to
            ``None``.
        app_token: Optional app token forwarded to the config.
        request_timeout_seconds: Per-request read timeout in seconds.
        created_date_sort: Sort direction, ``"asc"`` or ``"desc"``.

    Returns:
        A ``models.SocrataConfig`` carrying the five arguments.
    """
    bulk_config = models.SocrataConfig(
        created_date_sort=created_date_sort,
        request_timeout_seconds=request_timeout_seconds,
        max_pages=max_pages,
        page_size=page_size,
        app_token=app_token,
    )
    return bulk_config

smoke_socrata_config ¶

smoke_socrata_config(
    *,
    page_size: int = 5000,
    app_token: str | None = None,
    request_timeout_seconds: float = 120.0,
) -> models.SocrataConfig

Recent-first Socrata config used with a per-borough row cap (see about-the-data --preset smoke).

Source code in src/nyc311/presets.py

def smoke_socrata_config(
    *,
    page_size: int = 5_000,
    app_token: str | None = None,
    request_timeout_seconds: float = 120.0,
) -> models.SocrataConfig:
    """Return the recent-first Socrata config used for smoke samples.

    Intended for use with a per-borough row cap (see about-the-data
    ``--preset smoke``). ``max_pages`` is always ``None`` and
    ``created_date_sort`` is always ``"desc"``.

    Args:
        page_size: Forwarded to ``SocrataConfig.page_size``. Defaults to
            5,000.
        app_token: Optional app token forwarded to the config.
        request_timeout_seconds: Per-request timeout in seconds. Defaults
            to 120.

    Returns:
        A ``models.SocrataConfig`` with descending created-date sort.
    """
    smoke_config = models.SocrataConfig(
        created_date_sort="desc",
        max_pages=None,
        request_timeout_seconds=request_timeout_seconds,
        page_size=page_size,
        app_token=app_token,
    )
    return smoke_config

Factors¶

nyc311.factors ¶

Composable factor pipeline for NYC 311 complaint analysis.

EquityGapFactor ¶

Bases: Factor

Disparity metric: ratio of unit resolution time to citywide median.

Values above 1.0 indicate the unit resolves complaints slower than the citywide median; below 1.0, faster.

Source code in src/nyc311/factors/_advanced.py

class EquityGapFactor(Factor):
    """Disparity metric: unit median duration relative to a citywide median.

    A result above 1.0 means the unit's median is larger (slower) than
    the citywide median supplied at construction; below 1.0, smaller.

    NOTE(review): each complaint's duration is measured from its
    ``created_date`` to the context's ``time_window_end``, not to a
    closure timestamp -- confirm this proxy for resolution time is
    intended.
    """

    name = "equity_gap"
    dtype = "float"

    def __init__(self, citywide_median_days: float) -> None:
        """Remember the citywide baseline used as the ratio's denominator.

        Args:
            citywide_median_days: Citywide median resolution time, in days.
        """
        self._citywide_median = citywide_median_days

    def compute(self, context: FactorContext) -> float:
        """Return ``unit_median / citywide_median`` for ``context``.

        Only complaints with a non-``None`` ``resolution_description``
        are considered. Negative day counts are clamped to ``0.0``.

        Returns:
            The ratio, or ``0.0`` when no complaint qualifies or the
            citywide median is non-positive.
        """
        closed = [
            complaint
            for complaint in context.complaints
            if complaint.resolution_description is not None
        ]
        if not closed:
            return 0.0
        if self._citywide_median <= 0:
            return 0.0

        elapsed_days = []
        for complaint in closed:
            span = context.time_window_end - complaint.created_date
            # Complaints created after the window end would be negative.
            elapsed_days.append(max(float(span.days), 0.0))
        return median(elapsed_days) / self._citywide_median

name `class-attribute` `instance-attribute` ¶

name = 'equity_gap'

dtype `class-attribute` `instance-attribute` ¶

dtype = 'float'

compute ¶

compute(context: FactorContext) -> float

Return the resolution-time equity ratio for context.

Returns:

Type	Description
`float`	`unit_median / citywide_median`, or `0.0` when no
`float`	resolved complaints exist or the citywide median is
`float`	non-positive.

Source code in src/nyc311/factors/_advanced.py

def compute(self, context: FactorContext) -> float:
    """Return ``unit_median / citywide_median`` for ``context``.

    Only complaints with a non-``None`` ``resolution_description`` are
    considered. Each duration is the whole-day span from ``created_date``
    to ``context.time_window_end``, clamped at ``0.0``.

    Returns:
        The ratio, or ``0.0`` when no complaint qualifies or the
        citywide median is non-positive.
    """
    closed = [
        complaint
        for complaint in context.complaints
        if complaint.resolution_description is not None
    ]
    if not closed:
        return 0.0
    if self._citywide_median <= 0:
        return 0.0

    elapsed_days = []
    for complaint in closed:
        span = context.time_window_end - complaint.created_date
        # Complaints created after the window end would be negative.
        elapsed_days.append(max(float(span.days), 0.0))
    return median(elapsed_days) / self._citywide_median

SpatialLagFactor ¶

Bases: Factor

Spatial lag of complaint counts: weighted sum of neighbor values (a weighted average when the weights are row-standardized).

Uses a precomputed spatial weights dict and a values dict to compute the weighted sum of neighboring unit values for the focal unit.

Source code in src/nyc311/factors/_advanced.py

class SpatialLagFactor(Factor):
    """Spatial lag: weighted sum of the values of a unit's neighbors.

    Works from a precomputed spatial-weights mapping and a values
    mapping. With row-standardized weights the weighted sum is a
    weighted average of the neighboring values.
    """

    name = "spatial_lag"
    dtype = "float"

    def __init__(
        self,
        weights: dict[str, dict[str, float]],
        values: dict[str, float],
    ) -> None:
        """Store the weights and values used by :meth:`compute`.

        Args:
            weights: Nested dict ``{unit_a: {unit_b: weight}}`` of
                spatial weights (typically row-standardized).
            values: Mapping ``{unit_id: numeric_value}`` of the variable
                to lag.
        """
        self._weights = weights
        self._values = values

    def compute(self, context: FactorContext) -> float:
        """Return the spatial lag for ``context.geography_value``.

        Neighbors missing from the values mapping contribute ``0.0``.

        Returns:
            The weighted sum of neighboring values, or ``0.0`` when the
            unit has no neighbors in the weights mapping.
        """
        neighbor_weights = self._weights.get(context.geography_value, {})
        if not neighbor_weights:
            return 0.0
        value_of = self._values.get
        return sum(
            weight * value_of(neighbor, 0.0)
            for neighbor, weight in neighbor_weights.items()
        )

name `class-attribute` `instance-attribute` ¶

name = 'spatial_lag'

dtype `class-attribute` `instance-attribute` ¶

dtype = 'float'

compute ¶

compute(context: FactorContext) -> float

Return the spatial lag for the context's geographic unit.

Returns:

Type	Description
`float`	The weighted sum of neighboring values. Returns `0.0`
`float`	when the unit has no neighbors in the weights dict.

Source code in src/nyc311/factors/_advanced.py

def compute(self, context: FactorContext) -> float:
    """Return the spatial lag for ``context.geography_value``.

    Neighbors missing from the values mapping contribute ``0.0``.

    Returns:
        The weighted sum of neighboring values, or ``0.0`` when the
        unit has no neighbors in the weights mapping.
    """
    neighbor_weights = self._weights.get(context.geography_value, {})
    if not neighbor_weights:
        return 0.0
    value_of = self._values.get
    return sum(
        weight * value_of(neighbor, 0.0)
        for neighbor, weight in neighbor_weights.items()
    )

Factor ¶

Bases: ABC

Abstract base for a single named computation over a FactorContext.

Source code in src/nyc311/factors/_base.py

class Factor(ABC):
    """Abstract base for a single named computation over a FactorContext.

    Concrete factors provide a ``name``, a ``dtype`` tag, and an
    implementation of :meth:`compute`.
    """

    # Identifier of the factor; Pipeline.run uses it as the result column key.
    name: str
    # One of four literal tags; presumably describes the type of value
    # compute() returns -- confirm against consumers of this attribute.
    dtype: Literal["float", "str", "bool", "int"]

    @abstractmethod
    def compute(self, context: FactorContext) -> float | str | bool | int:
        """Return the computed value for *context*."""

name `instance-attribute` ¶

name: str

dtype `instance-attribute` ¶

dtype: Literal['float', 'str', 'bool', 'int']

compute `abstractmethod` ¶

compute(context: FactorContext) -> float | str | bool | int

Return the computed value for context.

Source code in src/nyc311/factors/_base.py

@abstractmethod
def compute(self, context: FactorContext) -> float | str | bool | int:
    """Return the computed value for *context*.

    Args:
        context: The :class:`FactorContext` to evaluate.

    Returns:
        The factor's value for ``context``. Concrete subclasses supply
        the implementation.
    """

FactorContext `dataclass` ¶

Row-level context for factor computation.

Each context represents one geographic unit (community district, NTA, borough) over one time window. Factors compute a single value from this context.

Source code in src/nyc311/factors/_base.py

@dataclass(frozen=True, slots=True)
class FactorContext:
    """Row-level context for factor computation.

    Each context represents one geographic unit (community district, NTA,
    borough) over one time window.  Factors compute a single value from
    this context.
    """

    # Name of the geography level this context belongs to.
    geography: str
    # Identifier of the unit; Pipeline.run records it as the row's geography id.
    geography_value: str
    # Complaint records belonging to this unit and window.
    complaints: tuple[ServiceRequestRecord, ...]
    # Bounds of the time window (inclusivity not visible here -- TODO confirm).
    time_window_start: date
    time_window_end: date
    # Optional population; read by per-capita factors when present.
    total_population: int | None = None
    # Optional free-form mapping; presumably for factor-specific inputs.
    extras: dict[str, Any] | None = None

geography `instance-attribute` ¶

geography: str

geography_value `instance-attribute` ¶

geography_value: str

complaints `instance-attribute` ¶

complaints: tuple[ServiceRequestRecord, ...]

time_window_start `instance-attribute` ¶

time_window_start: date

time_window_end `instance-attribute` ¶

time_window_end: date

total_population `class-attribute` `instance-attribute` ¶

total_population: int | None = None

extras `class-attribute` `instance-attribute` ¶

extras: dict[str, Any] | None = None

Pipeline ¶

Immutable builder that executes factors over contexts.

Pipeline never mutates in place: `add` returns a new pipeline with the factor appended.

Source code in src/nyc311/factors/_base.py

class Pipeline:
    """Immutable builder that executes factors over contexts.

    ``Pipeline`` never mutates in place: :meth:`add` returns a **new**
    pipeline with the factor appended. Factor names key the result
    columns, so :meth:`run` rejects pipelines whose factors share a name.
    """

    __slots__ = ("_factors",)

    def __init__(self, factors: tuple[Factor, ...] = ()) -> None:
        """Store the ordered factors this pipeline evaluates.

        Args:
            factors: Factors in evaluation order. Defaults to empty.
        """
        self._factors = factors

    def add(self, factor: Factor) -> Pipeline:
        """Return a new pipeline with ``factor`` appended.

        Args:
            factor: The factor to append. Must define a unique ``name``;
                uniqueness is checked when :meth:`run` executes.

        Returns:
            A new :class:`Pipeline` whose ``factors`` tuple ends with
            ``factor``. The receiver is left unmodified.
        """
        return Pipeline((*self._factors, factor))

    @property
    def factors(self) -> tuple[Factor, ...]:
        """The ordered factors in this pipeline."""
        return self._factors

    def as_factor_factory_estimate(
        self,
        panel: Any,
        *,
        family: str = "did",
        method: str = "twfe",
        outcome: str | None = None,
        **engine_kwargs: Any,
    ) -> Any:
        """Run a factor-factory engine on ``panel`` as a Pipeline continuation.

        Additive bridge: the pipeline itself is not executed here.
        Instead, the call dispatches into
        ``factor_factory.engines.<family>.estimate``, returning a
        factor-factory ``<Family>Results`` object that downstream code
        can chain off.

        Args:
            panel: A :class:`factor_factory.tidy.Panel`. Typically
                produced by
                :meth:`nyc311.temporal.PanelDataset.to_factor_factory_panel`.
            family: Engine-family module name under
                ``factor_factory.engines``. Defaults to ``"did"``.
            method: Registry key for a specific adapter inside the
                family (e.g. ``"twfe"``, ``"cs"``). Defaults to
                ``"twfe"``.
            outcome: Outcome column on the Panel. When ``None``, the
                engine falls back to ``panel.outcome_col``.
            **engine_kwargs: Additional kwargs forwarded to the engine's
                ``estimate`` dispatcher.

        Returns:
            A factor-factory ``<Family>Results`` object.

        Raises:
            ImportError: If factor-factory is not installed or the
                requested engine family's optional dependencies are
                missing.
        """
        # Imported lazily so the bridge module is only loaded on demand.
        from nyc311.factors._factor_factory import dispatch_factor_factory_engine

        return dispatch_factor_factory_engine(
            panel,
            family=family,
            method=method,
            outcome=outcome,
            **engine_kwargs,
        )

    def run(self, contexts: Iterable[FactorContext]) -> PipelineResult:
        """Execute all factors across ``contexts`` and return results.

        Iterates over each context once and evaluates every factor against
        it, producing a columnar :class:`PipelineResult` keyed by factor
        name.

        Args:
            contexts: An iterable of :class:`FactorContext` instances. Each
                context corresponds to one geographic-unit / time-window
                row in the final result.

        Returns:
            A :class:`PipelineResult` whose ``columns`` map factor names to
            value tuples and whose ``geography_ids`` tuple aligns with
            those columns positionally.

        Raises:
            ValueError: If two factors in the pipeline share a ``name``.
        """
        # Columns are keyed by factor name. Two factors with one name would
        # share a column and append twice per context, breaking alignment
        # with ``geography_ids`` -- so fail loudly before computing anything.
        seen_names: set[str] = set()
        for factor in self._factors:
            if factor.name in seen_names:
                message = (
                    f"Duplicate factor name {factor.name!r}; factor names "
                    "must be unique within a pipeline."
                )
                raise ValueError(message)
            seen_names.add(factor.name)

        geography_ids: list[str] = []
        columns: dict[str, list[Any]] = {f.name: [] for f in self._factors}

        for ctx in contexts:
            geography_ids.append(ctx.geography_value)
            for factor in self._factors:
                columns[factor.name].append(factor.compute(ctx))

        return PipelineResult(
            columns={name: tuple(values) for name, values in columns.items()},
            geography_ids=tuple(geography_ids),
        )

factors `property` ¶

factors: tuple[Factor, ...]

The ordered factors in this pipeline.

add ¶

add(factor: Factor) -> Pipeline

Return a new pipeline with factor appended.

Parameters:

Name	Type	Description	Default
`factor`	`Factor`	The factor to append. Must define a unique `name`.	required

Returns:

Type	Description
`Pipeline`	A new :class:`Pipeline` whose `factors` tuple ends with
`Pipeline`	`factor`. The receiver is left unmodified.

Source code in src/nyc311/factors/_base.py

def add(self, factor: Factor) -> Pipeline:
    """Build a new pipeline consisting of the current factors plus ``factor``.

    Args:
        factor: The factor to place last. Must define a unique ``name``.

    Returns:
        A fresh :class:`Pipeline` whose ``factors`` tuple ends with
        ``factor``. The pipeline this is called on is not changed.
    """
    extended_factors = tuple(self._factors) + (factor,)
    return Pipeline(extended_factors)

as_factor_factory_estimate ¶

as_factor_factory_estimate(
    panel: Any,
    *,
    family: str = "did",
    method: str = "twfe",
    outcome: str | None = None,
    **engine_kwargs: Any,
) -> Any

Run a factor-factory engine on panel as a Pipeline continuation.

Additive bridge: the pipeline itself is not executed here. Instead, the call dispatches into factor_factory.engines.<family>.estimate, returning a factor-factory <Family>Results object that downstream code can chain off.

Parameters:

Name	Type	Description	Default
`panel`	`Any`	A :class:`factor_factory.tidy.Panel`. Typically produced by :meth:`nyc311.temporal.PanelDataset.to_factor_factory_panel`.	required
`family`	`str`	Engine-family module name under `factor_factory.engines`. Defaults to `"did"`.	`'did'`
`method`	`str`	Registry key for a specific adapter inside the family (e.g. `"twfe"`, `"cs"`). Defaults to `"twfe"`.	`'twfe'`
`outcome`	`str \| None`	Outcome column on the Panel. When `None`, the engine falls back to `panel.outcome_col`.	`None`
`**engine_kwargs`	`Any`	Additional kwargs forwarded to the engine's `estimate` dispatcher.	`{}`

Returns:

Type	Description
`Any`	A factor-factory `<Family>Results` object.

Raises:

Type	Description
`ImportError`	If factor-factory is not installed or the requested engine family's optional dependencies are missing.

Source code in src/nyc311/factors/_base.py

def as_factor_factory_estimate(
    self,
    panel: Any,
    *,
    family: str = "did",
    method: str = "twfe",
    outcome: str | None = None,
    **engine_kwargs: Any,
) -> Any:
    """Hand ``panel`` to a factor-factory engine as a Pipeline continuation.

    This is an additive bridge: none of the pipeline's own factors are
    executed. The call is dispatched into
    ``factor_factory.engines.<family>.estimate`` and the resulting
    factor-factory ``<Family>Results`` object is returned so downstream
    code can chain off it.

    Args:
        panel: A :class:`factor_factory.tidy.Panel`, typically produced
            by :meth:`nyc311.temporal.PanelDataset.to_factor_factory_panel`.
        family: Engine-family module name under
            ``factor_factory.engines``. Defaults to ``"did"``.
        method: Registry key for a specific adapter inside the family
            (e.g. ``"twfe"``, ``"cs"``). Defaults to ``"twfe"``.
        outcome: Outcome column on the Panel. When ``None``, the engine
            falls back to ``panel.outcome_col``.
        **engine_kwargs: Extra keyword arguments forwarded to the
            engine's ``estimate`` dispatcher.

    Returns:
        A factor-factory ``<Family>Results`` object.

    Raises:
        ImportError: If factor-factory is not installed or the requested
            engine family's optional dependencies are missing.
    """
    # Deferred import: the bridge module is only loaded when this is called.
    from nyc311.factors._factor_factory import (
        dispatch_factor_factory_engine as dispatch,
    )

    engine_results = dispatch(
        panel,
        family=family,
        method=method,
        outcome=outcome,
        **engine_kwargs,
    )
    return engine_results

run ¶

run(contexts: Iterable[FactorContext]) -> PipelineResult

Execute all factors across contexts and return results.

Iterates over each context once and evaluates every factor against it, producing a columnar `PipelineResult` keyed by factor name.

Parameters:

Name	Type	Description	Default
`contexts`	`Iterable[FactorContext]`	An iterable of :class:`FactorContext` instances. Each context corresponds to one geographic-unit / time-window row in the final result.	required

Returns:

Type	Description
`PipelineResult`	A `PipelineResult` whose `columns` map factor names to value tuples and whose `geography_ids` tuple aligns with those columns positionally.

Source code in src/nyc311/factors/_base.py

def run(self, contexts: Iterable[FactorContext]) -> PipelineResult:
    """Execute all factors across ``contexts`` and return results.

    Iterates over each context once and evaluates every factor against
    it, producing a columnar :class:`PipelineResult` keyed by factor
    name.

    Args:
        contexts: An iterable of :class:`FactorContext` instances. Each
            context corresponds to one geographic-unit / time-window
            row in the final result.

    Returns:
        A :class:`PipelineResult` whose ``columns`` map factor names to
        value tuples and whose ``geography_ids`` tuple aligns with
        those columns positionally.

    Raises:
        ValueError: If two factors in the pipeline share a ``name``.
    """
    # Columns are keyed by factor name. Two factors with one name would
    # share a column and append twice per context, breaking alignment
    # with ``geography_ids`` -- so fail loudly before computing anything.
    seen_names: set[str] = set()
    for factor in self._factors:
        if factor.name in seen_names:
            message = (
                f"Duplicate factor name {factor.name!r}; factor names "
                "must be unique within a pipeline."
            )
            raise ValueError(message)
        seen_names.add(factor.name)

    geography_ids: list[str] = []
    columns: dict[str, list[Any]] = {f.name: [] for f in self._factors}

    for ctx in contexts:
        geography_ids.append(ctx.geography_value)
        for factor in self._factors:
            columns[factor.name].append(factor.compute(ctx))

    return PipelineResult(
        columns={name: tuple(values) for name, values in columns.items()},
        geography_ids=tuple(geography_ids),
    )

PipelineResult `dataclass` ¶

Columnar result set produced by `Pipeline.run`.

Source code in src/nyc311/factors/_base.py

@dataclass(frozen=True, slots=True)
class PipelineResult:
    """Columnar result set produced by :meth:`Pipeline.run`."""

    columns: dict[str, tuple[Any, ...]]
    geography_ids: tuple[str, ...]

    def to_records(self) -> tuple[dict[str, Any], ...]:
        """Convert to a tuple of row dictionaries.

        Returns:
            A tuple where each element is a dict containing
            ``geography_id`` plus one key per factor in the pipeline. The
            row order matches :attr:`geography_ids`.
        """
        records: list[dict[str, Any]] = []
        for i, geography_id in enumerate(self.geography_ids):
            row: dict[str, Any] = {"geography_id": geography_id}
            for col_name, values in self.columns.items():
                row[col_name] = values[i]
            records.append(row)
        return tuple(records)

    def to_dataframe(self) -> Any:
        """Convert to a pandas DataFrame indexed by ``geography_id``.

        Returns:
            A ``pandas.DataFrame`` with one row per geographic unit and
            one column per factor, indexed by ``geography_id``.

        Raises:
            ImportError: If pandas is not installed. Install the optional
                dataframes extra with ``pip install nyc311[dataframes]``.
        """
        try:
            import pandas as pd
        except ImportError as exc:
            message = (
                "pandas is required for to_dataframe(). "
                "Install it with: pip install nyc311[dataframes]"
            )
            raise ImportError(message) from exc

        data: dict[str, Any] = {"geography_id": self.geography_ids, **self.columns}
        return pd.DataFrame(data).set_index("geography_id")

columns `instance-attribute` ¶

columns: dict[str, tuple[Any, ...]]

geography_ids `instance-attribute` ¶

geography_ids: tuple[str, ...]

to_records ¶

to_records() -> tuple[dict[str, Any], ...]

Convert to a tuple of row dictionaries.

Returns:

Type	Description
`tuple[dict[str, Any], ...]`	A tuple where each element is a dict containing `geography_id` plus one key per factor in the pipeline. The row order matches `geography_ids`.

Source code in src/nyc311/factors/_base.py

def to_records(self) -> tuple[dict[str, Any], ...]:
    """Pivot the columns into one dictionary per row.

    Returns:
        A tuple of dicts, each holding ``geography_id`` followed by one
        entry per column. Rows follow the order of :attr:`geography_ids`.
    """
    return tuple(
        {
            "geography_id": geography_id,
            **{
                column_name: column_values[position]
                for column_name, column_values in self.columns.items()
            },
        }
        for position, geography_id in enumerate(self.geography_ids)
    )

to_dataframe ¶

to_dataframe() -> Any

Convert to a pandas DataFrame indexed by geography_id.

Returns:

Type	Description
`Any`	A `pandas.DataFrame` with one row per geographic unit and
`Any`	one column per factor, indexed by `geography_id`.

Raises:

Type	Description
`ImportError`	If pandas is not installed. Install the optional dataframes extra with `pip install nyc311[dataframes]`.

Source code in src/nyc311/factors/_base.py

def to_dataframe(self) -> Any:
    """Build a pandas DataFrame indexed by ``geography_id``.

    Returns:
        A ``pandas.DataFrame`` with one row per geographic unit and one
        column per factor, indexed by ``geography_id``.

    Raises:
        ImportError: If pandas is not installed. Install the optional
            dataframes extra with ``pip install nyc311[dataframes]``.
    """
    try:
        import pandas as pd
    except ImportError as exc:
        raise ImportError(
            "pandas is required for to_dataframe(). "
            "Install it with: pip install nyc311[dataframes]"
        ) from exc

    frame = pd.DataFrame({"geography_id": self.geography_ids, **self.columns})
    return frame.set_index("geography_id")

AnomalyScoreFactor ¶

Bases: Factor

Z-score of this unit's complaint volume.

A z-score is relative to the full set of contexts in a pipeline run, which a single `compute` call cannot see. As a stateless compromise, the factor uses a fixed population_mean and population_std provided at construction time rather than accumulating counts during `Pipeline.run`.

Returns 0.0 when population_std is zero.

Source code in src/nyc311/factors/_builtin.py

class AnomalyScoreFactor(Factor):
    """Z-score of this unit's complaint volume.

    A z-score is relative to the **full set of contexts** in a pipeline
    run, which a single :meth:`compute` call cannot see.  As a stateless
    compromise the factor uses a fixed *population_mean* and
    *population_std* provided at construction time.

    Returns ``0.0`` when *population_std* is zero.
    """

    name = "anomaly_score"
    dtype = "float"

    def __init__(
        self,
        *,
        population_mean: float,
        population_std: float,
    ) -> None:
        """Store the reference distribution used for scoring.

        Args:
            population_mean: Mean complaint count each context is
                compared against. Should be precomputed across the full
                set of contexts the pipeline will see.
            population_std: Population standard deviation of complaint
                counts. A value of ``0`` makes :meth:`compute` return
                ``0.0`` for every context (the z-score is undefined).
        """
        self._mean = population_mean
        self._std = population_std

    def compute(self, context: FactorContext) -> float:
        """Return the z-score of the number of complaints in ``context``.

        Returns:
            ``(count - population_mean) / population_std``, or ``0.0``
            when ``population_std`` is zero.
        """
        spread = self._std
        if spread == 0:
            return 0.0
        deviation = len(context.complaints) - self._mean
        return deviation / spread

name `class-attribute` `instance-attribute` ¶

name = 'anomaly_score'

dtype `class-attribute` `instance-attribute` ¶

dtype = 'float'

compute ¶

compute(context: FactorContext) -> float

Return the z-score of this context's complaint volume.

Returns:

Type	Description
`float`	`(count - population_mean) / population_std`, or `0.0`
`float`	when `population_std` is zero.

Source code in src/nyc311/factors/_builtin.py

def compute(self, context: FactorContext) -> float:
    """Return the z-score of the number of complaints in ``context``.

    Returns:
        ``(count - population_mean) / population_std``, or ``0.0``
        when ``population_std`` is zero.
    """
    spread = self._std
    if spread == 0:
        return 0.0
    deviation = len(context.complaints) - self._mean
    return deviation / spread

ComplaintVolumeFactor ¶

Bases: Factor

Total complaint count, optionally per-capita per 10 000 residents.

When per_capita is True and `FactorContext.total_population` is available, the result is count / population * 10_000 (a float). Otherwise the raw integer count is returned.

Source code in src/nyc311/factors/_builtin.py

class ComplaintVolumeFactor(Factor):
    """Total complaint count, optionally per-capita per 10 000 residents.

    With *per_capita* enabled and a positive
    :attr:`FactorContext.total_population`, the value is
    ``count / population * 10_000`` (a float). In every other case the
    raw integer count is returned -- including per-capita mode when the
    population is missing or not positive.
    """

    dtype = "int"

    def __init__(self, *, per_capita: bool = False) -> None:
        """Configure the factor's mode, name, and dtype.

        Args:
            per_capita: If ``True``, normalize by
                :attr:`FactorContext.total_population`; the factor is
                named ``complaint_rate_per_10k`` and tagged ``"float"``.
                Otherwise it is named ``complaint_volume`` and keeps the
                class-level ``"int"`` dtype.
        """
        self._per_capita = per_capita
        if per_capita:
            self.name = "complaint_rate_per_10k"
            # Instance-level override of the class default dtype.
            self.dtype = "float"  # type: ignore[assignment]
        else:
            self.name = "complaint_volume"

    def compute(self, context: FactorContext) -> int | float:
        """Return the complaint volume (or per-capita rate) for ``context``.

        Returns:
            The integer number of complaints in the context, or, when
            ``per_capita`` is enabled and the population is positive,
            the float ``count / population * 10_000``.
        """
        total = len(context.complaints)
        if not self._per_capita:
            return total
        population = context.total_population
        if population and population > 0:
            return total / population * 10_000
        # Population missing or non-positive: fall back to the raw count.
        return total

dtype `class-attribute` `instance-attribute` ¶

dtype = 'int'

name `instance-attribute` ¶

name = (
    "complaint_rate_per_10k"
    if per_capita
    else "complaint_volume"
)

compute ¶

compute(context: FactorContext) -> int | float

Return the complaint volume (or per-capita rate) for context.

Returns:

Type	Description
`int \| float`	The integer count of complaints in the context, or, when
`int \| float`	`per_capita` is enabled and population is available, the
`int \| float`	float `count / population * 10_000`.

Source code in src/nyc311/factors/_builtin.py

def compute(self, context: FactorContext) -> int | float:
    """Count the complaints in ``context``, optionally as a rate.

    Returns:
        ``count / population * 10_000`` as a float when ``per_capita``
        was requested and ``context.total_population`` is a positive
        number. In every other case the raw integer complaint count.
    """
    total = len(context.complaints)
    if not self._per_capita:
        return total

    population = context.total_population
    # A missing, zero or negative population falls back to the raw count.
    if population and population > 0:
        return total / population * 10_000
    return total

RecurrenceFactor ¶

Bases: Factor

Fraction of distinct complaint locations that appear more than once.

Locations are identified by rounding latitude/longitude to 4 decimal places (~11 m precision). Returns 0.0 when no complaints have coordinates.

Source code in src/nyc311/factors/_builtin.py

class RecurrenceFactor(Factor):
    """Fraction of distinct complaint locations that appear more than once.

    Locations are identified by rounding latitude/longitude to 4 decimal
    places (~11 m precision).  The result is the number of rounded
    locations holding more than one complaint divided by the number of
    distinct rounded locations.  Returns ``0.0`` when no complaints have
    coordinates.
    """

    name = "recurrence_rate"
    dtype = "float"

    def compute(self, context: FactorContext) -> float:
        """Return the recurrent-location share for ``context``.

        Returns:
            The fraction of geocoded complaint locations (latitude and
            longitude rounded to 4 decimal places) that appear more than
            once in the context. Returns ``0.0`` when no complaints have
            coordinates.
        """
        # Complaints missing either coordinate cannot be placed, so they
        # are left out of both the numerator and the denominator.
        location_counts = Counter(
            (round(c.latitude, 4), round(c.longitude, 4))
            for c in context.complaints
            if c.latitude is not None and c.longitude is not None
        )
        if not location_counts:
            return 0.0

        recurrent = sum(1 for hits in location_counts.values() if hits > 1)
        return recurrent / len(location_counts)

name `class-attribute` `instance-attribute` ¶

name = 'recurrence_rate'

dtype `class-attribute` `instance-attribute` ¶

dtype = 'float'

compute ¶

compute(context: FactorContext) -> float

Return the recurrent-location share for context.

Returns:

Type	Description
`float`	The fraction of geocoded complaint locations (latitude and
`float`	longitude rounded to 4 decimal places) that appear more than
`float`	once in the context. Returns `0.0` when no complaints have
`float`	coordinates.

Source code in src/nyc311/factors/_builtin.py

def compute(self, context: FactorContext) -> float:
    """Return the recurrent-location share for ``context``.

    Returns:
        The fraction of geocoded complaint locations (latitude and
        longitude rounded to 4 decimal places) that appear more than
        once in the context. Returns ``0.0`` when no complaints have
        coordinates.
    """
    # Complaints missing either coordinate cannot be placed, so they
    # are left out of both the numerator and the denominator.
    location_counts = Counter(
        (round(c.latitude, 4), round(c.longitude, 4))
        for c in context.complaints
        if c.latitude is not None and c.longitude is not None
    )
    if not location_counts:
        return 0.0

    recurrent = sum(1 for hits in location_counts.values() if hits > 1)
    return recurrent / len(location_counts)

ResolutionTimeFactor ¶

Bases: Factor

Median or mean days between complaint creation and the end of the context's time window, taken over resolved complaints.

Uses resolution_description is not None as a proxy for resolved. Returns -1.0 when no resolved complaints exist in the context.

Source code in src/nyc311/factors/_builtin.py

class ResolutionTimeFactor(Factor):
    """Median or mean age, in days, of resolved complaints.

    Uses ``resolution_description is not None`` as a proxy for resolved.
    Elapsed time is measured from each complaint's ``created_date`` to the
    context's ``time_window_end`` and clamped at zero.
    Returns ``-1.0`` when no resolved complaints exist in the context.
    """

    name = "resolution_time_days"
    dtype = "float"

    def __init__(self, *, method: str = "median") -> None:
        """Initialize the factor.

        Args:
            method: Aggregation strategy across resolved complaints; one
                of ``"median"`` (default) or ``"mean"``.

        Raises:
            ValueError: If ``method`` is not ``"median"`` or ``"mean"``.
        """
        if method not in ("median", "mean"):
            msg = f"method must be 'median' or 'mean', got {method!r}"
            raise ValueError(msg)
        self._method = method

    def compute(self, context: FactorContext) -> float:
        """Return the median (or mean) resolution time for ``context``.

        Returns:
            The number of days between complaint creation and the
            window's end across resolved complaints, aggregated by the
            configured ``method``. Returns ``-1.0`` when no complaints in
            the context have a resolution description.
        """
        window_end = context.time_window_end
        # Complaints created after the window end would give a negative
        # span; those are clamped to 0.0 rather than dropped.
        days = [
            max(float((window_end - c.created_date).days), 0.0)
            for c in context.complaints
            if c.resolution_description is not None
        ]
        if not days:
            return -1.0
        return median(days) if self._method == "median" else mean(days)

name `class-attribute` `instance-attribute` ¶

name = 'resolution_time_days'

dtype `class-attribute` `instance-attribute` ¶

dtype = 'float'

compute ¶

compute(context: FactorContext) -> float

Return the median (or mean) resolution time for context.

Returns:

Type	Description
`float`	The number of days between complaint creation and the
`float`	window's end across resolved complaints, aggregated by the
`float`	configured `method`. Returns `-1.0` when no complaints in
`float`	the context have a resolution description.

Source code in src/nyc311/factors/_builtin.py

def compute(self, context: FactorContext) -> float:
    """Return the median (or mean) resolution time for ``context``.

    Returns:
        The number of days between complaint creation and the
        window's end across resolved complaints, aggregated by the
        configured ``method``. Returns ``-1.0`` when no complaints in
        the context have a resolution description.
    """
    window_end = context.time_window_end
    # Complaints created after the window end would give a negative
    # span; those are clamped to 0.0 rather than dropped.
    days = [
        max(float((window_end - c.created_date).days), 0.0)
        for c in context.complaints
        if c.resolution_description is not None
    ]
    if not days:
        return -1.0
    return median(days) if self._method == "median" else mean(days)

ResponseRateFactor ¶

Bases: Factor

Fraction of complaints that received a resolution description.

Range [0.0, 1.0]. Returns 0.0 for empty contexts.

Source code in src/nyc311/factors/_builtin.py

class ResponseRateFactor(Factor):
    """Share of complaints that carry a resolution description.

    The value lies in [0.0, 1.0]; an empty context yields ``0.0``.
    """

    name = "response_rate"
    dtype = "float"

    def compute(self, context: FactorContext) -> float:
        """Return the resolved share of the complaints in ``context``.

        Returns:
            The number of complaints whose ``resolution_description`` is
            not ``None`` divided by the total number of complaints, or
            ``0.0`` when the context holds no complaints.
        """
        complaints = context.complaints
        if not complaints:
            return 0.0

        answered = [c for c in complaints if c.resolution_description is not None]
        return len(answered) / len(complaints)

name `class-attribute` `instance-attribute` ¶

name = 'response_rate'

dtype `class-attribute` `instance-attribute` ¶

dtype = 'float'

compute ¶

compute(context: FactorContext) -> float

Return the resolved fraction of complaints in context.

Returns:

Type	Description
`float`	The fraction of complaints with a non-null
`float`	`resolution_description`, in `[0.0, 1.0]`. Returns
`float`	`0.0` for empty contexts.

Source code in src/nyc311/factors/_builtin.py

def compute(self, context: FactorContext) -> float:
    """Return the resolved share of the complaints in ``context``.

    Returns:
        The number of complaints whose ``resolution_description`` is
        not ``None`` divided by the total number of complaints, or
        ``0.0`` when the context holds no complaints.
    """
    complaints = context.complaints
    if not complaints:
        return 0.0

    answered = [c for c in complaints if c.resolution_description is not None]
    return len(answered) / len(complaints)

SeasonalityFactor ¶

Bases: Factor

Deviation of complaint count from a seasonal baseline.

baseline_monthly_counts maps month number (1-12) to the expected count for that month. The factor returns (actual - expected) / expected as a fractional deviation. Returns 0.0 when the baseline is missing for the context's month or is zero.

Source code in src/nyc311/factors/_builtin.py

class SeasonalityFactor(Factor):
    """Relative gap between the complaint count and a monthly baseline.

    *baseline_monthly_counts* maps a month number (1-12) to the count
    expected in that month.  The factor yields ``(actual - expected) /
    expected``.  It yields ``0.0`` when the context's month has no
    baseline or the baseline is zero or negative.
    """

    name = "seasonality_deviation"
    dtype = "float"

    def __init__(self, baseline_monthly_counts: dict[int, float]) -> None:
        """Store the seasonal baseline.

        Args:
            baseline_monthly_counts: Expected complaint count keyed by
                month number (``1`` through ``12``). A month absent from
                the mapping is treated as having no baseline. The mapping
                is kept by reference, not copied.
        """
        self._baseline = baseline_monthly_counts

    def compute(self, context: FactorContext) -> float:
        """Return the fractional deviation from the seasonal baseline.

        Returns:
            ``(actual - expected) / expected``, with ``actual`` the
            number of complaints in the context and ``expected`` the
            baseline for the month of ``context.time_window_start``.
            ``0.0`` when that baseline is missing or not positive.
        """
        baseline = self._baseline.get(context.time_window_start.month, 0.0)
        if baseline <= 0:
            return 0.0

        observed = len(context.complaints)
        return (observed - baseline) / baseline

name `class-attribute` `instance-attribute` ¶

name = 'seasonality_deviation'

dtype `class-attribute` `instance-attribute` ¶

dtype = 'float'

compute ¶

compute(context: FactorContext) -> float

Return the fractional deviation from the seasonal baseline.

Returns:

Type	Description
`float`	`(actual - expected) / expected` where `actual` is the
`float`	number of complaints in the context and `expected` is the
`float`	baseline for the context's start-month. Returns `0.0` when
`float`	the baseline is missing or non-positive for that month.

Source code in src/nyc311/factors/_builtin.py

def compute(self, context: FactorContext) -> float:
    """Return the fractional deviation from the seasonal baseline.

    Returns:
        ``(actual - expected) / expected``, with ``actual`` the
        number of complaints in the context and ``expected`` the
        baseline for the month of ``context.time_window_start``.
        ``0.0`` when that baseline is missing or not positive.
    """
    baseline = self._baseline.get(context.time_window_start.month, 0.0)
    if baseline <= 0:
        return 0.0

    observed = len(context.complaints)
    return (observed - baseline) / baseline

TopicConcentrationFactor ¶

Bases: Factor

Herfindahl-Hirschman Index of complaint-type shares.

HHI = sum(share_i^2) where share_i is the proportion of complaints of type i. Range [1/N, 1.0]; higher values indicate more concentration in fewer complaint types.

Returns 0.0 when the context has no complaints.

Source code in src/nyc311/factors/_builtin.py

class TopicConcentrationFactor(Factor):
    """Herfindahl-Hirschman Index (HHI) over complaint types.

    HHI = sum(share_i^2), with share_i the proportion of complaints of
    type *i*.  Range [1/N, 1.0]; a larger value means the complaints are
    concentrated in fewer types.

    Returns ``0.0`` when the context has no complaints.
    """

    name = "topic_concentration"
    dtype = "float"

    def compute(self, context: FactorContext) -> float:
        """Return the HHI of complaint-type shares for ``context``.

        Returns:
            The sum of squared shares, where a share is the count of one
            ``complaint_type`` divided by the total number of complaints.
            ``0.0`` when the context has no complaints.
        """
        complaints = context.complaints
        if not complaints:
            return 0.0

        total = len(complaints)
        per_type = Counter(c.complaint_type for c in complaints)
        squared_shares = [(hits / total) ** 2 for hits in per_type.values()]
        return sum(squared_shares)

name `class-attribute` `instance-attribute` ¶

name = 'topic_concentration'

dtype `class-attribute` `instance-attribute` ¶

dtype = 'float'

compute ¶

compute(context: FactorContext) -> float

Return the HHI of complaint-type shares for context.

Returns:

Type	Description
`float`	`sum(share_i ** 2)` where each `share_i` is the proportion
`float`	of complaints of type `i`. The value lies in `[1/N, 1.0]`
`float`	and increases as complaints concentrate in fewer types.
`float`	Returns `0.0` when the context has no complaints.

Source code in src/nyc311/factors/_builtin.py

def compute(self, context: FactorContext) -> float:
    """Return the HHI of complaint-type shares for ``context``.

    Returns:
        The sum of squared shares, where a share is the count of one
        ``complaint_type`` divided by the total number of complaints.
        ``0.0`` when the context has no complaints.
    """
    complaints = context.complaints
    if not complaints:
        return 0.0

    total = len(complaints)
    per_type = Counter(c.complaint_type for c in complaints)
    squared_shares = [(hits / total) ** 2 for hits in per_type.values()]
    return sum(squared_shares)

dispatch_factor_factory_engine ¶

dispatch_factor_factory_engine(
    panel: Panel,
    *,
    family: str = "did",
    method: str = "twfe",
    outcome: str | None = None,
    **engine_kwargs: Any,
) -> Any

Call factor_factory.engines.<family>.estimate on panel.

This is the chaining target behind `nyc311.factors.Pipeline.as_factor_factory_estimate`. It lazily imports the requested engine family so callers don't pay the import cost for families they don't use, and it raises a friendly `ImportError` when the family's optional dependencies are missing.

Parameters:

Name	Type	Description	Default
`panel`	`Panel`	A :class:`factor_factory.tidy.Panel`. Typically produced by :meth:`nyc311.temporal.PanelDataset.to_factor_factory_panel`.	required
`family`	`str`	Engine-family module name under `factor_factory.engines`. One of :data:`_SUPPORTED_FAMILIES`.	`'did'`
`method`	`str`	Registry key for a specific adapter inside the family. For example, `"twfe"` / `"cs"` / `"sa"` / `"bjs"` for `family="did"`.	`'twfe'`
`outcome`	`str \| None`	Outcome column on the Panel. When `None`, the engine falls back to `panel.outcome_col` (the primary outcome declared in :class:`PanelMetadata`).	`None`
`**engine_kwargs`	`Any`	Additional keyword arguments forwarded to the engine's `estimate` dispatcher.	`{}`

Returns:

Type	Description
`Any`	The factor-factory `<Family>Results` object the engine
`Any`	returned. Its :meth:`summary_table` method produces a
`Any`	`pandas.DataFrame` summary.

Raises:

Type	Description
`ValueError`	If `family` is not in :data:`_SUPPORTED_FAMILIES`.
`ImportError`	If factor-factory is not installed or the requested engine family's optional dependencies are missing.

Source code in src/nyc311/factors/_factor_factory.py

def dispatch_factor_factory_engine(
    panel: ff_tidy.Panel,
    *,
    family: str = "did",
    method: str = "twfe",
    outcome: str | None = None,
    **engine_kwargs: Any,
) -> Any:
    """Run ``factor_factory.engines.<family>.estimate`` against ``panel``.

    The engine family module is imported on demand with
    :func:`importlib.import_module`, after ``family`` has been checked
    against :data:`_SUPPORTED_FAMILIES`. An import failure is re-raised as
    an :class:`ImportError` carrying an installation hint.

    Args:
        panel: The :class:`factor_factory.tidy.Panel` to estimate on.
        family: Name of the engine-family module under
            ``factor_factory.engines``. Must be a member of
            :data:`_SUPPORTED_FAMILIES`.
        method: Adapter key inside the family. It is passed to the engine
            as the single-element tuple ``methods=(method,)``.
        outcome: Outcome column name, forwarded unchanged (``None``
            included) to the engine.
        **engine_kwargs: Extra keyword arguments forwarded verbatim to the
            engine's ``estimate`` callable.

    Returns:
        Whatever the engine family's ``estimate`` callable returns.

    Raises:
        ValueError: If ``family`` is not in :data:`_SUPPORTED_FAMILIES`.
        ImportError: If the engine family module cannot be imported.
    """
    # Validate before importing so a typo never triggers an import attempt.
    if family not in _SUPPORTED_FAMILIES:
        unknown_family = (
            f"Unknown factor-factory engine family {family!r}. "
            f"Supported: {_SUPPORTED_FAMILIES}"
        )
        raise ValueError(unknown_family)

    engine_path = f"factor_factory.engines.{family}"
    try:
        engine = importlib.import_module(engine_path)
    except ImportError as exc:
        install_hint = (
            f"Could not import {engine_path}. Install factor-factory "
            "with: pip install nyc311 (or pip install factor-factory)."
        )
        raise ImportError(install_hint) from exc

    return engine.estimate(
        panel,
        methods=(method,),
        outcome=outcome,
        **engine_kwargs,
    )

Temporal¶

nyc311.temporal ¶

Temporal panel module for longitudinal 311 complaint analysis.

PanelDataset `dataclass` ¶

Balanced panel of (geographic_unit x time_period) observations.

Methods return new `PanelDataset` instances — the dataset is never mutated in place.

Source code in src/nyc311/temporal/_models.py

@dataclass(frozen=True, slots=True)
class PanelDataset:
    """Balanced panel of (geographic_unit x time_period) observations.

    Methods return **new** :class:`PanelDataset` instances—the dataset is
    never mutated in place.
    """

    observations: tuple[PanelObservation, ...]
    unit_type: str
    periods: tuple[str, ...]
    treatment_events: tuple[TreatmentEvent, ...] = ()

    # ------------------------------------------------------------------
    # Filtering helpers
    # ------------------------------------------------------------------

    def treatment_group(self) -> PanelDataset:
        """Return only observations in units that were ever treated.

        Returns:
            A new :class:`PanelDataset` whose ``observations`` are
            restricted to units with a non-null ``treatment_date``. The
            ``periods`` and ``treatment_events`` fields are preserved.
        """
        treated_ids = {
            obs.unit_id for obs in self.observations if obs.treatment_date is not None
        }
        return PanelDataset(
            observations=tuple(
                o for o in self.observations if o.unit_id in treated_ids
            ),
            unit_type=self.unit_type,
            periods=self.periods,
            treatment_events=self.treatment_events,
        )

    def control_group(self) -> PanelDataset:
        """Return only observations in units that were never treated.

        Returns:
            A new :class:`PanelDataset` whose ``observations`` are
            restricted to units with no ``treatment_date``. The
            ``periods`` and ``treatment_events`` fields are preserved.
        """
        treated_ids = {
            obs.unit_id for obs in self.observations if obs.treatment_date is not None
        }
        return PanelDataset(
            observations=tuple(
                o for o in self.observations if o.unit_id not in treated_ids
            ),
            unit_type=self.unit_type,
            periods=self.periods,
            treatment_events=self.treatment_events,
        )

    def filter_periods(self, start: str, end: str) -> PanelDataset:
        """Restrict the dataset to a closed interval of periods.

        Args:
            start: Inclusive lower-bound period label.
            end: Inclusive upper-bound period label.

        Returns:
            A new :class:`PanelDataset` whose ``observations`` and
            ``periods`` are limited to labels ``p`` satisfying
            ``start <= p <= end``.
        """
        filtered_periods = tuple(p for p in self.periods if start <= p <= end)
        return PanelDataset(
            observations=tuple(
                o for o in self.observations if start <= o.period <= end
            ),
            unit_type=self.unit_type,
            periods=filtered_periods,
            treatment_events=self.treatment_events,
        )

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def unit_ids(self) -> tuple[str, ...]:
        """The sorted, unique unit identifiers in the dataset.

        Returns:
            A tuple of distinct ``unit_id`` values from
            ``observations``, in lexicographic order.
        """
        return tuple(sorted({obs.unit_id for obs in self.observations}))

    # ------------------------------------------------------------------
    # Export
    # ------------------------------------------------------------------

    def to_factor_factory_panel(
        self,
        *,
        outcome_col: str = "complaint_count",
        provenance: Any | None = None,
        spatial_weights: dict[str, dict[str, float]] | None = None,
    ) -> Any:
        """Convert to a :class:`factor_factory.tidy.Panel`.

        The adapter is additive — ``self`` is unchanged. Treatment events
        are translated to factor-factory's frozen
        :class:`TreatmentEvent` model, and an optional
        ``spatial_weights`` dict (as produced by
        :func:`nyc311.temporal.build_distance_weights`) is stashed on
        ``panel.df.attrs["nyc311_spatial_weights"]`` for in-memory
        round-trip.

        See :mod:`nyc311.temporal._factor_factory` for details on the
        column crosswalk.

        Args:
            outcome_col: Column name to tag as the primary outcome in
                the Panel metadata. Defaults to ``"complaint_count"``.
            provenance: Optional ``factor_factory.tidy.Provenance``
                record. When ``None``, a default pointing at the NYC
                Open Data Socrata endpoint is constructed.
            spatial_weights: Optional nested weights dict from
                :func:`build_distance_weights`.

        Returns:
            A fully-validated ``factor_factory.tidy.Panel``.

        Raises:
            ImportError: If factor-factory or pandas is not installed.
            ValueError: If the dataset is empty or ``outcome_col`` is
                absent from the resulting DataFrame.
        """
        from nyc311.temporal._factor_factory import (
            panel_dataset_to_factor_factory,
        )

        return panel_dataset_to_factor_factory(
            self,
            outcome_col=outcome_col,
            provenance=provenance,
            spatial_weights=spatial_weights,
        )

    def to_dataframe(self) -> Any:
        """Convert to a pandas DataFrame with a ``(unit_id, period)`` MultiIndex.

        Each per-type complaint count is exploded into a
        ``complaints_<type>`` column, and any per-unit covariates are
        merged in as additional columns.

        Returns:
            A ``pandas.DataFrame`` indexed by ``(unit_id, period)`` with
            one column per panel measure. The frame has no rows when the
            dataset is empty.

        Raises:
            ImportError: If pandas is not installed. Install the optional
                dataframes extra with ``pip install nyc311[dataframes]``.
        """
        try:
            import pandas as pd
        except ImportError as exc:
            message = (
                "pandas is required for to_dataframe(). "
                "Install it with: pip install nyc311[dataframes]"
            )
            raise ImportError(message) from exc

        rows: list[dict[str, Any]] = []
        for obs in self.observations:
            row: dict[str, Any] = {
                "unit_id": obs.unit_id,
                "period": obs.period,
                "complaint_count": obs.complaint_count,
                "resolution_rate": obs.resolution_rate,
                "median_resolution_days": obs.median_resolution_days,
                "treatment": obs.treatment,
                "population": obs.population,
            }
            for ctype, cnt in obs.complaint_counts_by_type.items():
                row[f"complaints_{ctype}"] = cnt
            if obs.covariates:
                row.update(obs.covariates)
            rows.append(row)

        df = pd.DataFrame(rows)
        if not df.empty:
            df = df.set_index(["unit_id", "period"])
        return df

observations `instance-attribute` ¶

observations: tuple[PanelObservation, ...]

unit_type `instance-attribute` ¶

unit_type: str

periods `instance-attribute` ¶

periods: tuple[str, ...]

treatment_events `class-attribute` `instance-attribute` ¶

treatment_events: tuple[TreatmentEvent, ...] = ()

unit_ids `property` ¶

unit_ids: tuple[str, ...]

The sorted, unique unit identifiers in the dataset.

Returns:

Type	Description
`tuple[str, ...]`	A tuple of distinct `unit_id` values from `observations`, in lexicographic order.

treatment_group ¶

treatment_group() -> PanelDataset

Return only observations in units that were ever treated.

Returns:

Type	Description
`PanelDataset`	A new :class:`PanelDataset` whose `observations` are
`PanelDataset`	restricted to units with a non-null `treatment_date`. The
`PanelDataset`	`periods` and `treatment_events` fields are preserved.

Source code in src/nyc311/temporal/_models.py

def treatment_group(self) -> PanelDataset:
    """Keep the observations of every unit that was ever treated.

    A unit counts as treated when at least one of its observations
    has a ``treatment_date`` that is not ``None``.

    Returns:
        A new :class:`PanelDataset` limited to those units, with
        ``unit_type``, ``periods`` and ``treatment_events`` carried
        over unchanged.
    """
    ever_treated = {
        obs.unit_id for obs in self.observations if obs.treatment_date is not None
    }
    kept = tuple(obs for obs in self.observations if obs.unit_id in ever_treated)
    return PanelDataset(
        observations=kept,
        unit_type=self.unit_type,
        periods=self.periods,
        treatment_events=self.treatment_events,
    )

control_group ¶

control_group() -> PanelDataset

Return only observations in units that were never treated.

Returns:

Type	Description
`PanelDataset`	A new :class:`PanelDataset` whose `observations` are
`PanelDataset`	restricted to units with no `treatment_date`. The
`PanelDataset`	`periods` and `treatment_events` fields are preserved.

Source code in src/nyc311/temporal/_models.py

def control_group(self) -> PanelDataset:
    """Keep the observations of every unit that was never treated.

    A unit counts as treated when at least one of its observations
    has a ``treatment_date`` that is not ``None``; all other units
    form the control group.

    Returns:
        A new :class:`PanelDataset` limited to the untreated units,
        with ``unit_type``, ``periods`` and ``treatment_events``
        carried over unchanged.
    """
    ever_treated = {
        obs.unit_id for obs in self.observations if obs.treatment_date is not None
    }
    kept = tuple(
        obs for obs in self.observations if obs.unit_id not in ever_treated
    )
    return PanelDataset(
        observations=kept,
        unit_type=self.unit_type,
        periods=self.periods,
        treatment_events=self.treatment_events,
    )

filter_periods ¶

filter_periods(start: str, end: str) -> PanelDataset

Restrict the dataset to a closed interval of periods.

Parameters:

Name	Type	Description	Default
`start`	`str`	Inclusive lower-bound period label.	required
`end`	`str`	Inclusive upper-bound period label.	required

Returns:

Type	Description
`PanelDataset`	A new :class:`PanelDataset` whose `observations` and
`PanelDataset`	`periods` are limited to labels `p` satisfying
`PanelDataset`	`start <= p <= end`.

Source code in src/nyc311/temporal/_models.py

def filter_periods(self, start: str, end: str) -> PanelDataset:
    """Keep only the periods inside the closed interval [start, end].

    Period labels are compared with the ordinary ``<=`` operator on
    the label values themselves.

    Args:
        start: Inclusive lower-bound period label.
        end: Inclusive upper-bound period label.

    Returns:
        A new :class:`PanelDataset` whose ``periods`` and
        ``observations`` are restricted to labels ``p`` with
        ``start <= p <= end``. ``treatment_events`` is not filtered.
    """
    kept_periods = tuple(label for label in self.periods if start <= label <= end)
    kept_observations = tuple(
        obs for obs in self.observations if start <= obs.period <= end
    )
    return PanelDataset(
        observations=kept_observations,
        unit_type=self.unit_type,
        periods=kept_periods,
        treatment_events=self.treatment_events,
    )

to_factor_factory_panel ¶

to_factor_factory_panel(
    *,
    outcome_col: str = "complaint_count",
    provenance: Any | None = None,
    spatial_weights: dict[str, dict[str, float]]
    | None = None,
) -> Any

Convert to a `factor_factory.tidy.Panel`.

The adapter is additive — `self` is unchanged. Treatment events are translated to factor-factory's frozen `TreatmentEvent` model, and an optional `spatial_weights` dict (as produced by `nyc311.temporal.build_distance_weights`) is stashed on `panel.df.attrs["nyc311_spatial_weights"]` for in-memory round-trip.

See `nyc311.temporal._factor_factory` for details on the column crosswalk.

Parameters:

Name	Type	Description	Default
`outcome_col`	`str`	Column name to tag as the primary outcome in the Panel metadata. Defaults to `"complaint_count"`.	`'complaint_count'`
`provenance`	`Any \| None`	Optional `factor_factory.tidy.Provenance` record. When `None`, a default pointing at the NYC Open Data Socrata endpoint is constructed.	`None`
`spatial_weights`	`dict[str, dict[str, float]] \| None`	Optional nested weights dict from :func:`build_distance_weights`.	`None`

Returns:

Type	Description
`Any`	A fully-validated `factor_factory.tidy.Panel`.

Raises:

Type	Description
`ImportError`	If factor-factory or pandas is not installed.
`ValueError`	If the dataset is empty or `outcome_col` is absent from the resulting DataFrame.

Source code in src/nyc311/temporal/_models.py

def to_factor_factory_panel(
    self,
    *,
    outcome_col: str = "complaint_count",
    provenance: Any | None = None,
    spatial_weights: dict[str, dict[str, float]] | None = None,
) -> Any:
    """Build a :class:`factor_factory.tidy.Panel` from this dataset.

    This method is a thin forwarding wrapper: ``self`` and every keyword
    argument are handed unchanged to
    :func:`nyc311.temporal._factor_factory.panel_dataset_to_factor_factory`,
    which performs the actual conversion and documents the column
    crosswalk. The dataset itself is not modified.

    Args:
        outcome_col: Name of the column to tag as the primary outcome
            in the Panel metadata. Defaults to ``"complaint_count"``.
        provenance: Optional ``factor_factory.tidy.Provenance`` record.
            When ``None``, the adapter builds a default one that points
            at the NYC Open Data Socrata endpoint.
        spatial_weights: Optional nested weights dict, for example the
            output of :func:`nyc311.temporal.build_distance_weights`.
            The adapter stores it on
            ``panel.df.attrs["nyc311_spatial_weights"]``.

    Returns:
        The ``factor_factory.tidy.Panel`` produced by the adapter.

    Raises:
        ImportError: If factor-factory or pandas is not installed.
        ValueError: If the dataset is empty or ``outcome_col`` is not
            a column of the converted DataFrame.
    """
    # Resolved at call time rather than at module import.
    # NOTE(review): presumably this keeps the optional dependency lazy
    # and avoids an import cycle with the adapter module — confirm.
    from nyc311.temporal._factor_factory import (
        panel_dataset_to_factor_factory as _convert,
    )

    forwarded = {
        "outcome_col": outcome_col,
        "provenance": provenance,
        "spatial_weights": spatial_weights,
    }
    return _convert(self, **forwarded)

to_dataframe ¶

to_dataframe() -> Any

Convert to a pandas DataFrame with a (unit_id, period) MultiIndex.

Each per-type complaint count is exploded into a complaints_<type> column, and any per-unit covariates are merged in as additional columns.

Returns:

Type	Description
`Any`	A `pandas.DataFrame` indexed by `(unit_id, period)` with
`Any`	one column per panel measure. The frame has no rows when the
`Any`	dataset is empty.

Raises:

Type	Description
`ImportError`	If pandas is not installed. Install the optional dataframes extra with `pip install nyc311[dataframes]`.

Source code in src/nyc311/temporal/_models.py

def to_dataframe(self) -> Any:
    """Flatten the panel into a pandas DataFrame keyed by ``(unit_id, period)``.

    Each observation becomes one row. The row carries the core measures
    (``complaint_count``, ``resolution_rate``,
    ``median_resolution_days``, ``treatment``, ``population``), one
    ``complaints_<type>`` column per complaint type, and one column per
    covariate attached to the observation. ``treatment_date`` is not
    exported. Covariates are applied last, so a covariate that shares a
    name with an earlier column replaces that column's value.

    Returns:
        A ``pandas.DataFrame`` with a ``(unit_id, period)`` MultiIndex
        and one column per panel measure. When the dataset has no
        observations the frame has no rows and is returned without
        that index.

    Raises:
        ImportError: If pandas is not installed. Install the optional
            dataframes extra with ``pip install nyc311[dataframes]``.
    """
    try:
        import pandas as pd
    except ImportError as exc:
        message = (
            "pandas is required for to_dataframe(). "
            "Install it with: pip install nyc311[dataframes]"
        )
        raise ImportError(message) from exc

    records: list[dict[str, Any]] = []
    for observation in self.observations:
        record: dict[str, Any] = {
            "unit_id": observation.unit_id,
            "period": observation.period,
            "complaint_count": observation.complaint_count,
            "resolution_rate": observation.resolution_rate,
            "median_resolution_days": observation.median_resolution_days,
            "treatment": observation.treatment,
            "population": observation.population,
        }
        # Per-type counts follow the core measures, covariates come last;
        # this insertion order determines the resulting column order.
        record.update(
            {
                f"complaints_{kind}": count
                for kind, count in observation.complaint_counts_by_type.items()
            }
        )
        record.update(observation.covariates or {})
        records.append(record)

    frame = pd.DataFrame(records)
    if frame.empty:
        # Nothing to index: an empty frame has no unit_id/period columns.
        return frame
    return frame.set_index(["unit_id", "period"])

PanelObservation `dataclass` ¶

One row in a balanced panel: (geographic_unit x time_period).

Source code in src/nyc311/temporal/_models.py

@dataclass(frozen=True, slots=True)
class PanelObservation:
    """One row in a balanced panel: (geographic_unit x time_period).

    Immutable, slotted record describing a single unit/period cell. Cells
    with no complaints are still represented (``complaint_count == 0``)
    so that every unit has an observation for every period.
    """

    #: Stable identifier of the geographic unit (community district code,
    #: NTA code, borough name, etc.).
    unit_id: str
    #: Period label (for example ``"2024-03"`` for monthly panels).
    period: str
    #: Total number of complaints in this unit/period cell; ``0`` for a
    #: cell that was filled in to balance the panel.
    complaint_count: int
    #: Per-complaint-type counts within this cell, keyed by complaint type.
    complaint_counts_by_type: dict[str, int]
    #: Fraction of complaints with a non-null ``resolution_description``;
    #: ``0.0`` when the cell has no complaints.
    resolution_rate: float
    #: Median days from creation to period-end across resolved complaints
    #: (negative spans are clamped to zero), or ``None`` when no complaint
    #: in the cell was resolved.
    median_resolution_days: float | None
    #: ``True`` once the unit has been exposed to a treatment event, i.e.
    #: for periods that start on or after the unit's treatment date.
    treatment: bool
    #: Date the unit was first treated (the earliest event covering the
    #: unit), or ``None`` if never treated.
    treatment_date: date | None
    #: Total population for the unit, when supplied; ``None`` otherwise.
    population: int | None
    #: Optional time-invariant covariates merged in at panel-build time,
    #: keyed by covariate name.
    covariates: dict[str, float] | None = None

unit_id `instance-attribute` ¶

unit_id: str

period `instance-attribute` ¶

period: str

complaint_count `instance-attribute` ¶

complaint_count: int

complaint_counts_by_type `instance-attribute` ¶

complaint_counts_by_type: dict[str, int]

resolution_rate `instance-attribute` ¶

resolution_rate: float

median_resolution_days `instance-attribute` ¶

median_resolution_days: float | None

treatment `instance-attribute` ¶

treatment: bool

treatment_date `instance-attribute` ¶

treatment_date: date | None

population `instance-attribute` ¶

population: int | None

covariates `class-attribute` `instance-attribute` ¶

covariates: dict[str, float] | None = None

TreatmentEvent `dataclass` ¶

A policy intervention applied to specific geographic units.

Source code in src/nyc311/temporal/_models.py

@dataclass(frozen=True, slots=True)
class TreatmentEvent:
    """A policy intervention applied to specific geographic units."""

    name: str
    description: str
    treated_units: tuple[str, ...]
    treatment_date: date
    geography: str

    def __post_init__(self) -> None:
        if not self.name.strip():
            raise ValueError("Treatment name must not be empty.")
        if not self.treated_units:
            raise ValueError("treated_units must contain at least one unit.")

name `instance-attribute` ¶

name: str

description `instance-attribute` ¶

description: str

treated_units `instance-attribute` ¶

treated_units: tuple[str, ...]

treatment_date `instance-attribute` ¶

treatment_date: date

geography `instance-attribute` ¶

geography: str

panel_dataset_to_factor_factory ¶

panel_dataset_to_factor_factory(
    dataset: PanelDataset,
    *,
    outcome_col: str = "complaint_count",
    provenance: Provenance | None = None,
    spatial_weights: dict[str, dict[str, float]]
    | None = None,
) -> ff_tidy.Panel

Convert a :class:PanelDataset to a :class:factor_factory.tidy.Panel.

Maps nyc311's panel model onto factor-factory's tidy Panel contract:

unit_id → Panel first-level MultiIndex, named unit_id.
period (string label) → pandas Timestamp at the period start, second-level index named period.
complaint_count → primary outcome column (configurable via outcome_col).
treatment (bool) → int 0/1 column named treatment.
resolution_rate, median_resolution_days, population, per-type complaint counts, and covariates flow through as additional columns the engine can consume as covariates.
TreatmentEvent tuples are translated to factor-factory's frozen :class:TreatmentEvent pydantic model (geography maps to dimension).
A spatial_weights dict (as produced by :func:nyc311.temporal.build_distance_weights) is attached to the resulting :attr:Panel.df.attrs under the key "nyc311_spatial_weights" for in-memory round-trip.

Parameters:

Name	Type	Description	Default
`dataset`	`PanelDataset`	The balanced :class:`PanelDataset` to convert.	required
`outcome_col`	`str`	Column name to tag as the primary outcome in the Panel metadata. Must be one of `"complaint_count"`, `"resolution_rate"`, `"median_resolution_days"`, or a `"complaints_<type>"` column present on the observations.	`'complaint_count'`
`provenance`	`Provenance \| None`	Optional factor-factory :class:`Provenance` record describing the dataset. When `None`, a default is constructed pointing at the NYC Open Data 311 Socrata endpoint.	`None`
`spatial_weights`	`dict[str, dict[str, float]] \| None`	Optional nested dict as produced by :func:`build_distance_weights`. Stashed on `panel.df.attrs["nyc311_spatial_weights"]` so downstream code can pick it up without a second computation.	`None`

Returns:

Type	Description
`Panel`	A fully-validated :class:`factor_factory.tidy.Panel`.

Raises:

Type	Description
`ImportError`	If `factor-factory` or pandas is not installed.
`ValueError`	If `dataset` is empty or `outcome_col` is not among the columns of the assembled DataFrame.

Source code in src/nyc311/temporal/_factor_factory.py

def panel_dataset_to_factor_factory(
    dataset: PanelDataset,
    *,
    outcome_col: str = "complaint_count",
    provenance: ff_tidy.Provenance | None = None,
    spatial_weights: dict[str, dict[str, float]] | None = None,
) -> ff_tidy.Panel:
    """Translate a :class:`PanelDataset` into a :class:`factor_factory.tidy.Panel`.

    Column crosswalk applied to every observation:

    - ``unit_id`` → first level of the MultiIndex, named ``unit_id``.
    - ``period`` (string label) → converted by ``_period_to_timestamp``
      and used as the second index level, named ``period``.
    - ``complaint_count`` and ``resolution_rate`` → copied as columns.
    - ``treatment`` (bool) → ``int`` 0/1 column named ``treatment``.
    - ``median_resolution_days`` and ``population`` → copied only for
      observations where they are not ``None``.
    - per-type complaint counts → ``complaints_<type>`` columns.
    - covariates → one column per covariate name.

    The frame is sorted by its index. Each ``TreatmentEvent`` on the
    dataset is re-created as a factor-factory ``TreatmentEvent``, with
    ``geography`` passed as ``dimension``. When ``spatial_weights`` is
    given, a shallow copy is stored on ``panel.df.attrs`` under
    ``"nyc311_spatial_weights"`` for in-memory round-trip.

    Args:
        dataset: The :class:`PanelDataset` to convert.
        outcome_col: Column to tag as the primary outcome in the Panel
            metadata. It must be a column of the assembled frame, for
            example ``"complaint_count"``, ``"resolution_rate"``,
            ``"median_resolution_days"`` or a ``"complaints_<type>"``
            column.
        provenance: Optional factor-factory :class:`Provenance` record.
            When ``None``, a default describing the NYC Open Data 311
            Socrata dataset is constructed.
        spatial_weights: Optional nested dict as produced by
            :func:`build_distance_weights`.

    Returns:
        The constructed :class:`factor_factory.tidy.Panel`.

    Raises:
        ImportError: If ``factor-factory`` or pandas is not installed.
        ValueError: If ``dataset`` has no observations or
            ``outcome_col`` is not a column of the assembled frame.
    """
    try:
        import pandas as pd
        from factor_factory.tidy import (
            Panel,
            PanelMetadata,
            Provenance,
            TreatmentEvent as FFTreatmentEvent,
        )
    except ImportError as exc:
        message = (
            "factor-factory and pandas are required for "
            "PanelDataset.to_factor_factory_panel(). "
            "Install with: pip install nyc311"
        )
        raise ImportError(message) from exc

    if not dataset.observations:
        message = "Cannot convert an empty PanelDataset to a factor-factory Panel."
        raise ValueError(message)

    records: list[dict[str, Any]] = []
    for observation in dataset.observations:
        record: dict[str, Any] = {
            "unit_id": observation.unit_id,
            "period": _period_to_timestamp(observation.period),
            "complaint_count": observation.complaint_count,
            "resolution_rate": observation.resolution_rate,
            "treatment": int(observation.treatment),
        }
        # Optional measures are only written when present, so a measure
        # that is None everywhere never becomes a column.
        optional_measures = {
            "median_resolution_days": observation.median_resolution_days,
            "population": observation.population,
        }
        record.update(
            {
                name: value
                for name, value in optional_measures.items()
                if value is not None
            }
        )
        record.update(
            {
                f"complaints_{kind}": count
                for kind, count in observation.complaint_counts_by_type.items()
            }
        )
        record.update(observation.covariates or {})
        records.append(record)

    frame = pd.DataFrame(records)
    frame = frame.set_index(["unit_id", "period"])
    frame = frame.sort_index()

    if outcome_col not in frame.columns:
        message = (
            f"outcome_col={outcome_col!r} not in panel columns. "
            f"Available: {sorted(frame.columns)}"
        )
        raise ValueError(message)

    translated_events = []
    for event in dataset.treatment_events:
        translated_events.append(
            FFTreatmentEvent(
                name=event.name,
                description=event.description,
                treated_units=tuple(event.treated_units),
                treatment_date=event.treatment_date,
                dimension=event.geography,
            )
        )
    ff_events = tuple(translated_events)

    if provenance is None:
        provenance = Provenance(
            data_source="NYC Open Data — 311 Service Requests (Socrata erm2-nwe9)",
            license="CC0-1.0",
            creator="nyc311.temporal.PanelDataset",
            citation="https://opendata.cityofnewyork.us/",
        )

    panel = Panel(
        frame,
        PanelMetadata(
            outcome_cols=(outcome_col,),
            period_kind="timestamp",
            freq=_infer_freq(dataset.periods),
            dimension=dataset.unit_type,
            treatment_events=ff_events,
            record_count=len(dataset.observations),
            provenance=provenance,
        ),
    )

    if spatial_weights is None:
        return panel

    # Shallow copy: the outer mapping is detached from the caller's dict,
    # the inner per-unit dicts are shared.
    panel.df.attrs[_SPATIAL_WEIGHTS_ATTR] = dict(spatial_weights)
    return panel

spatial_weights_from_panel ¶

spatial_weights_from_panel(
    panel: Panel,
) -> dict[str, dict[str, float]] | None

Recover spatial weights previously attached via the adapter.

Parameters:

Name	Type	Description	Default
`panel`	`Panel`	A :class:`factor_factory.tidy.Panel` that was produced by :func:`panel_dataset_to_factor_factory` with `spatial_weights` supplied.	required

Returns:

Type	Description
`dict[str, dict[str, float]] \| None`	The nested weights dict, or `None` if no spatial weights were
`dict[str, dict[str, float]] \| None`	attached.

Source code in src/nyc311/temporal/_factor_factory.py

def spatial_weights_from_panel(
    panel: ff_tidy.Panel,
) -> dict[str, dict[str, float]] | None:
    """Read back the spatial weights stored on a converted panel.

    Args:
        panel: A :class:`factor_factory.tidy.Panel`, typically one
            returned by :func:`panel_dataset_to_factor_factory` called
            with ``spatial_weights``.

    Returns:
        A shallow copy of the nested weights dict found on
        ``panel.df.attrs``, or ``None`` when no weights are stored
        there.
    """
    attached = panel.df.attrs.get(_SPATIAL_WEIGHTS_ATTR)
    # Copy the outer mapping so callers cannot rebind entries on the panel.
    return None if attached is None else dict(attached)

build_complaint_panel ¶

build_complaint_panel(
    records: Sequence[ServiceRequestRecord],
    *,
    geography: str = "community_district",
    freq: str = "ME",
    treatment_events: Sequence[TreatmentEvent] = (),
    population_data: dict[str, int] | None = None,
    covariates: dict[str, dict[str, float]] | None = None,
) -> PanelDataset

Construct a balanced panel from service-request records.

Aggregates records into one observation per (geographic-unit, period) cell, filling missing cells so the resulting :class:PanelDataset is fully balanced across both dimensions.

Parameters:

Name	Type	Description	Default
`records`	`Sequence[ServiceRequestRecord]`	Raw complaint records to aggregate.	required
`geography`	`str`	Geographic unit to group by; one of `"borough"` or `"community_district"`.	`'community_district'`
`freq`	`str`	Pandas offset alias controlling the period length (`"ME"` for monthly, `"QE"` for quarterly, `"YE"` for yearly). Both legacy (`"M"`) and modern (`"ME"`) aliases are accepted.	`'ME'`
`treatment_events`	`Sequence[TreatmentEvent]`	Policy interventions to code as treatment indicators on each observation.	`()`
`population_data`	`dict[str, int] \| None`	Mapping `{unit_id: total_population}` used to populate :attr:`PanelObservation.population` for per-capita downstream analyses.	`None`
`covariates`	`dict[str, dict[str, float]] \| None`	Mapping `{unit_id: {name: value}}` of time-invariant demographic covariates to attach to each observation in a unit.	`None`

Returns:

Type	Description
`PanelDataset`	A :class:`PanelDataset` with one observation per `(unit, period)`. When `records` is empty the returned dataset has no observations and no periods.

Raises:

Type	Description
`ImportError`	If pandas is not installed. Install the optional dataframes extra with `pip install nyc311[dataframes]`.

Source code in src/nyc311/temporal/_panel.py

def build_complaint_panel(
    records: Sequence[ServiceRequestRecord],
    *,
    geography: str = "community_district",
    freq: str = "ME",
    treatment_events: Sequence[TreatmentEvent] = (),
    population_data: dict[str, int] | None = None,
    covariates: dict[str, dict[str, float]] | None = None,
) -> PanelDataset:
    """Construct a balanced panel from service-request records.

    Aggregates ``records`` into one observation per
    (geographic-unit, period) cell, filling missing cells so the
    resulting :class:`PanelDataset` is fully balanced across both
    dimensions. The period axis consists of the periods in which at
    least one record (of any unit) was created; a period with no
    records at all does not appear.

    A unit counts as treated in every period whose start date is on or
    after the earliest ``treatment_date`` among the events that list
    the unit.

    Args:
        records: Raw complaint records to aggregate.
        geography: Geographic unit to group by; one of ``"borough"`` or
            ``"community_district"``.
        freq: Pandas offset alias controlling the period length
            (``"ME"`` for monthly, ``"QE"`` for quarterly, ``"YE"`` for
            yearly). Both legacy (``"M"``) and modern (``"ME"``) aliases
            are accepted.
        treatment_events: Policy interventions to code as treatment
            indicators on each observation.
        population_data: Mapping ``{unit_id: total_population}`` used to
            populate :attr:`PanelObservation.population` for per-capita
            downstream analyses.
        covariates: Mapping ``{unit_id: {name: value}}`` of
            time-invariant demographic covariates to attach to each
            observation in a unit.

    Returns:
        A :class:`PanelDataset` with one observation per ``(unit,
        period)``. When ``records`` is empty the returned dataset has no
        observations and no periods.

    Raises:
        ImportError: If pandas is not installed. Install the optional
            dataframes extra with ``pip install nyc311[dataframes]``.
        TypeError: If a treatment date cannot be compared with a
            ``datetime.date``. Such values used to be swallowed
            silently, leaving the unit coded as never treated.
    """
    from datetime import datetime

    try:
        import pandas as pd
    except ImportError as exc:
        message = (
            "pandas is required for build_complaint_panel(). "
            "Install it with: pip install nyc311[dataframes]"
        )
        raise ImportError(message) from exc

    norm_freq = _normalize_freq(freq)

    # -- group records by (unit, period) ----------------------------------
    grouped: dict[tuple[str, str], list[ServiceRequestRecord]] = defaultdict(list)
    all_units: set[str] = set()
    # Keep the Period object behind every label so the label never has to
    # be parsed back into a Period later on.
    periods_by_label: dict[str, pd.Period] = {}

    for rec in records:
        unit = rec.geography_value(geography)
        period = pd.Timestamp(rec.created_date).to_period(norm_freq)
        period_label = str(period)
        all_units.add(unit)
        periods_by_label.setdefault(period_label, period)
        grouped[(unit, period_label)].append(rec)

    if not all_units:
        return PanelDataset(
            observations=(),
            unit_type=geography,
            periods=(),
            treatment_events=tuple(treatment_events),
        )

    # -- determine ordered period labels and their calendar bounds --------
    ordered_periods = tuple(sorted(periods_by_label))
    # Bounds depend only on the period, so compute them once per period
    # instead of once per (unit, period) cell.
    period_bounds: dict[str, tuple[date, date]] = {
        label: (period.start_time.date(), period.end_time.date())
        for label, period in periods_by_label.items()
    }

    # -- build treatment lookup (earliest event wins) ---------------------
    treatment_lookup: dict[str, date] = {}
    for event in treatment_events:
        for unit in event.treated_units:
            existing = treatment_lookup.get(unit)
            if existing is None or event.treatment_date < existing:
                treatment_lookup[unit] = event.treatment_date

    # -- build balanced panel ---------------------------------------------
    pops = population_data or {}
    covs = covariates or {}
    observations: list[PanelObservation] = []

    for unit in sorted(all_units):
        unit_treatment_date = treatment_lookup.get(unit)
        # ``date >= datetime`` raises TypeError, so reduce a datetime to
        # its calendar date for the comparison only; the value stored on
        # the observation stays exactly as supplied.
        if isinstance(unit_treatment_date, datetime):
            treatment_cutoff = unit_treatment_date.date()
        else:
            treatment_cutoff = unit_treatment_date

        for period_label in ordered_periods:
            period_start, period_end = period_bounds[period_label]
            recs = grouped.get((unit, period_label), [])

            complaint_count = len(recs)
            type_counts: Counter[str] = Counter(r.complaint_type for r in recs)

            resolved = [r for r in recs if r.resolution_description is not None]
            resolution_rate = (
                len(resolved) / complaint_count if complaint_count else 0.0
            )

            med_days: float | None = None
            if resolved:
                # Days from creation to the end of the period, never negative.
                med_days = median(
                    max((period_end - r.created_date).days, 0) for r in resolved
                )

            is_treated = (
                treatment_cutoff is not None and period_start >= treatment_cutoff
            )

            observations.append(
                PanelObservation(
                    unit_id=unit,
                    period=period_label,
                    complaint_count=complaint_count,
                    complaint_counts_by_type=dict(type_counts),
                    resolution_rate=resolution_rate,
                    median_resolution_days=med_days,
                    treatment=is_treated,
                    treatment_date=unit_treatment_date,
                    population=pops.get(unit),
                    covariates=covs.get(unit),
                )
            )

    return PanelDataset(
        observations=tuple(observations),
        unit_type=geography,
        periods=ordered_periods,
        treatment_events=tuple(treatment_events),
    )

build_distance_weights ¶

build_distance_weights(
    unit_centroids: dict[str, tuple[float, float]],
    *,
    threshold_meters: float = 2000.0,
    row_standardize: bool = True,
) -> dict[str, dict[str, float]]

Build an inverse-distance spatial weights matrix.

Units within threshold_meters are neighbors, weighted by 1 / distance. The resulting matrix is symmetric before row-standardization.

Parameters:

Name	Type	Description	Default
`unit_centroids`	`dict[str, tuple[float, float]]`	Mapping `{unit_id: (latitude, longitude)}` of unit centroids in WGS84 degrees.	required
`threshold_meters`	`float`	Maximum great-circle distance, in meters, for two units to be considered neighbors.	`2000.0`
`row_standardize`	`bool`	If `True`, normalize each row of the resulting weights matrix to sum to `1.0`.	`True`

Returns:

Type	Description
`dict[str, dict[str, float]]`	Nested dictionary `{unit_a: {unit_b: weight}}`. Units with no
`dict[str, dict[str, float]]`	neighbors map to an empty inner dict.

Source code in src/nyc311/temporal/_spatial_weights.py

def build_distance_weights(
    unit_centroids: dict[str, tuple[float, float]],
    *,
    threshold_meters: float = 2000.0,
    row_standardize: bool = True,
) -> dict[str, dict[str, float]]:
    """Compute inverse-distance spatial weights between unit centroids.

    Two distinct units are neighbors when the distance between their
    centroids is greater than zero and at most ``threshold_meters``;
    the pair then receives the weight ``1 / distance`` in both
    directions, so the matrix is symmetric before row-standardization.

    Args:
        unit_centroids: Mapping ``{unit_id: (latitude, longitude)}`` of
            unit centroids in WGS84 degrees.
        threshold_meters: Largest distance, in meters, at which two
            units still count as neighbors.
        row_standardize: If ``True``, rescale every non-empty row so
            its weights sum to ``1.0``.

    Returns:
        Nested dictionary ``{unit_a: {unit_b: weight}}`` with one entry
        per input unit. Units with no neighbors map to an empty inner
        dict.
    """
    ordered_ids = sorted(unit_centroids)
    weights: dict[str, dict[str, float]] = {}
    for unit_id in ordered_ids:
        weights[unit_id] = {}

    # Visit every unordered pair once and fill both directions.
    for next_index, origin in enumerate(ordered_ids, start=1):
        origin_lat, origin_lon = unit_centroids[origin]
        for other in ordered_ids[next_index:]:
            other_lat, other_lon = unit_centroids[other]
            meters = haversine_distance_meters(
                latitude_a=origin_lat,
                longitude_a=origin_lon,
                latitude_b=other_lat,
                longitude_b=other_lon,
            )
            # Zero distance is excluded to avoid dividing by zero.
            if not (0 < meters <= threshold_meters):
                continue
            inverse = 1.0 / meters
            weights[origin][other] = inverse
            weights[other][origin] = inverse

    if not row_standardize:
        return weights

    for unit_id in ordered_ids:
        row = weights[unit_id]
        total = sum(row.values())
        if total > 0:
            weights[unit_id] = {
                neighbor: value / total for neighbor, value in row.items()
            }
    return weights

centroids_from_boundaries ¶

centroids_from_boundaries(
    boundaries: Any,
) -> dict[str, tuple[float, float]]

Extract centroids from a :class:BoundaryCollection.

Computes a per-feature centroid as the mean of the exterior-ring coordinates. This is approximate but cheap and avoids a hard dependency on shapely.

.. note::

As of nyc-geo-toolkit v0.4.0,
:func:`nyc_geo_toolkit.centroids_from_boundaries` is available
as a shapely-backed, publication-grade centroid helper — it
returns a :class:`BoundaryCollection` of GeoJSON ``Point``
features at either the geometric centroid (default) or
shapely's ``representative_point`` (guaranteed to lie inside
concave polygons such as NYC's jagged community districts).
Prefer it when you already have shapely installed and need
defensible geometry for a published analysis.

nyc311's helper is intentionally the **shapely-free** path
(returns a plain ``dict[str, (lat, lon)]`` suitable for
feeding directly into :func:`build_distance_weights`) and is
preserved for workflows that need to stay on the lean base
install. The two helpers return different shapes and slightly
different numbers; don't swap them mid-analysis.

Parameters:

Name	Type	Description	Default
`boundaries`	`Any`	A boundary collection exposing a `features` iterable. Each feature must provide a `geometry` mapping with `"type"` (`"Polygon"` or `"MultiPolygon"`) and `"coordinates"`, plus a `geography_value` attribute.	required

Returns:

Type	Description
`dict[str, tuple[float, float]]`	Mapping `{geography_value: (latitude, longitude)}` for every
`dict[str, tuple[float, float]]`	feature whose exterior ring is non-empty.

Source code in src/nyc311/temporal/_spatial_weights.py

def centroids_from_boundaries(boundaries: Any) -> dict[str, tuple[float, float]]:
    """Extract centroids from a :class:`BoundaryCollection`.

    Computes a per-feature centroid as the mean of the exterior-ring
    coordinates. This is approximate but cheap and avoids a hard
    dependency on shapely. For any geometry whose ``"type"`` is not
    ``"Polygon"`` the coordinates are read as a MultiPolygon and only
    the exterior ring of its first polygon is used.

    .. note::

        As of nyc-geo-toolkit v0.4.0,
        :func:`nyc_geo_toolkit.centroids_from_boundaries` is available
        as a shapely-backed, publication-grade centroid helper — it
        returns a :class:`BoundaryCollection` of GeoJSON ``Point``
        features at either the geometric centroid (default) or
        shapely's ``representative_point`` (guaranteed to lie inside
        concave polygons such as NYC's jagged community districts).
        Prefer it when you already have shapely installed and need
        defensible geometry for a published analysis.

        nyc311's helper is intentionally the **shapely-free** path
        (returns a plain ``dict[str, (lat, lon)]`` suitable for
        feeding directly into :func:`build_distance_weights`) and is
        preserved for workflows that need to stay on the lean base
        install. The two helpers return different shapes and slightly
        different numbers; don't swap them mid-analysis.

    Args:
        boundaries: A boundary collection exposing a ``features``
            iterable. Each feature must provide a ``geometry`` mapping
            with ``"type"`` (``"Polygon"`` or ``"MultiPolygon"``) and
            ``"coordinates"``, plus a ``geography_value`` attribute.

    Returns:
        Mapping ``{geography_value: (latitude, longitude)}`` for every
        feature whose exterior ring is non-empty. Features with missing
        or empty coordinates, including a MultiPolygon whose first
        polygon is empty, are skipped.
    """
    centroids: dict[str, tuple[float, float]] = {}
    for feature in boundaries.features:
        geometry = feature.geometry
        coords = geometry.get("coordinates", [])
        if not coords:
            continue

        if geometry.get("type") == "Polygon":
            ring = coords[0]
        else:
            # MultiPolygon: coords[0] is the first polygon and its first
            # element the exterior ring. An empty first polygon used to
            # raise IndexError here; treat it as "no exterior ring".
            first_polygon = coords[0]
            ring = first_polygon[0] if first_polygon else []
        if not ring:
            continue

        # Ring positions are (longitude, latitude); the result is
        # reported as (latitude, longitude).
        lons = [pt[0] for pt in ring]
        lats = [pt[1] for pt in ring]
        centroids[feature.geography_value] = (
            sum(lats) / len(lats),
            sum(lons) / len(lons),
        )
    return centroids

weights_to_pysal ¶

weights_to_pysal(
    weights: dict[str, dict[str, float]],
) -> Any

Convert a weights dict to a :class:libpysal.weights.W object.

Parameters:

Name	Type	Description	Default
`weights`	`dict[str, dict[str, float]]`	Nested dictionary `{unit_a: {unit_b: weight}}` as produced by :func:`build_distance_weights`.	required

Returns:

Type	Description
`Any`	A `libpysal.weights.W` instance suitable for use with PySAL's
`Any`	spatial autocorrelation routines.

Raises:

Type	Description
`ImportError`	If libpysal is not installed. Install the optional stats extra with `pip install nyc311[stats]`.

Source code in src/nyc311/temporal/_spatial_weights.py

def weights_to_pysal(weights: dict[str, dict[str, float]]) -> Any:
    """Wrap a nested weights dict in a :class:`libpysal.weights.W` object.

    The dict is split into the two parallel mappings that ``W`` takes:
    unit → list of neighbor ids, and unit → list of the matching
    weights, both in the inner dict's iteration order.

    Args:
        weights: Nested dictionary ``{unit_a: {unit_b: weight}}`` as
            produced by :func:`build_distance_weights`.

    Returns:
        A ``libpysal.weights.W`` instance built from ``weights``.

    Raises:
        ImportError: If libpysal is not installed. Install the optional
            stats extra with ``pip install nyc311[stats]``.
    """
    try:
        from libpysal.weights import W
    except ImportError as exc:
        message = (
            "libpysal is required for spatial weights. "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(message) from exc

    neighbor_ids: dict[str, list[str]] = {}
    neighbor_weights: dict[str, list[float]] = {}
    for unit_id, row in weights.items():
        neighbor_ids[unit_id] = list(row)
        neighbor_weights[unit_id] = list(row.values())
    return W(neighbor_ids, neighbor_weights)

Stats¶

nyc311.stats ¶

PhD-level statistical modeling for NYC 311 complaint analysis.

STLAnomalyResult `dataclass` ¶

Result of STL-residual anomaly detection.

Source code in src/nyc311/stats/_anomaly.py

@dataclass(frozen=True, slots=True)
class STLAnomalyResult:
    """Result of STL-residual anomaly detection.

    Immutable, slotted record. The code that fills it is not part of
    this listing, so the field notes below are read off names and types
    and should be confirmed against the detector.
    """

    # NOTE(review): presumably the time points flagged as anomalous — confirm.
    anomaly_dates: tuple[Any, ...]
    # NOTE(review): presumably one score per entry of ``anomaly_dates``,
    # in the same order — confirm.
    anomaly_scores: tuple[float, ...]
    # NOTE(review): presumably the cut-off applied to the scores — confirm
    # its unit (raw residual vs. standard deviations).
    threshold: float
    # NOTE(review): presumably equal to ``len(anomaly_dates)`` — confirm.
    n_anomalies: int
    # NOTE(review): presumably the mean of the STL residual component — confirm.
    residual_mean: float
    # NOTE(review): presumably the standard deviation of the STL residual
    # component — confirm.
    residual_std: float

anomaly_dates `instance-attribute` ¶

anomaly_dates: tuple[Any, ...]

anomaly_scores `instance-attribute` ¶

anomaly_scores: tuple[float, ...]

threshold `instance-attribute` ¶

threshold: float

n_anomalies `instance-attribute` ¶

n_anomalies: int

residual_mean `instance-attribute` ¶

residual_mean: float

residual_std `instance-attribute` ¶

residual_std: float

BYM2Result `dataclass` ¶

Result of BYM2 small-area smoothing.

Source code in src/nyc311/stats/_bym2.py

@dataclass(frozen=True, slots=True)
class BYM2Result:
    """Result of BYM2 small-area smoothing.

    Immutable record returned by :func:`bym2_smooth`. The per-unit
    dictionaries are keyed by unit id.
    """

    # Posterior mean of the modelled rate for each unit.
    smoothed_rates: dict[str, float]
    # 2.5th percentile of the posterior rate draws for each unit.
    credible_lower: dict[str, float]
    # 97.5th percentile of the posterior rate draws for each unit.
    credible_upper: dict[str, float]
    # Posterior mean of the mixing parameter ``rho``.
    mixing_parameter: float
    # ``mixing_parameter`` times the posterior mean of ``sigma ** 2``.
    spatial_variance: float
    # ``(1 - mixing_parameter)`` times the posterior mean of ``sigma ** 2``.
    iid_variance: float
    # Unit ids in sorted order (the order used inside the model).
    unit_ids: tuple[str, ...]
    # The ``n_samples`` value the caller requested from the sampler.
    n_samples: int
    # Short multi-line text summary (area/edge counts, rho, variances).
    model_summary: str

smoothed_rates `instance-attribute` ¶

smoothed_rates: dict[str, float]

credible_lower `instance-attribute` ¶

credible_lower: dict[str, float]

credible_upper `instance-attribute` ¶

credible_upper: dict[str, float]

mixing_parameter `instance-attribute` ¶

mixing_parameter: float

spatial_variance `instance-attribute` ¶

spatial_variance: float

iid_variance `instance-attribute` ¶

iid_variance: float

unit_ids `instance-attribute` ¶

unit_ids: tuple[str, ...]

n_samples `instance-attribute` ¶

n_samples: int

model_summary `instance-attribute` ¶

model_summary: str

ChangepointResult `dataclass` ¶

Detected structural breaks in a time series.

Source code in src/nyc311/stats/_changepoint.py

@dataclass(frozen=True, slots=True)
class ChangepointResult:
    """Detected structural breaks in a time series.

    Immutable record returned by :func:`detect_changepoints`.
    """

    # Positional indices into the NaN-dropped series where a new segment
    # starts; the trailing end-of-series marker is excluded.
    breakpoints: tuple[int, ...]
    # Calendar date of the observation at each entry of ``breakpoints``.
    breakpoint_dates: tuple[date, ...]
    # Number of segments, i.e. ``len(breakpoints) + 1``.
    n_segments: int
    # Penalty actually passed to the detection algorithm (the caller's
    # value, or the computed default when none was given).
    penalty: float

breakpoints `instance-attribute` ¶

breakpoints: tuple[int, ...]

breakpoint_dates `instance-attribute` ¶

breakpoint_dates: tuple[date, ...]

n_segments `instance-attribute` ¶

n_segments: int

penalty `instance-attribute` ¶

penalty: float

DecompositionResult `dataclass` ¶

Seasonal + trend + residual decomposition.

Source code in src/nyc311/stats/_decomposition.py

@dataclass(frozen=True, slots=True)
class DecompositionResult:
    """Seasonal + trend + residual decomposition.

    Immutable record returned by :func:`seasonal_decompose`; the three
    components are taken directly from the fitted STL result.
    """

    # Trend component of the fitted STL result.
    trend: Any
    # Seasonal component of the fitted STL result.
    seasonal: Any
    # Residual component (``resid``) of the fitted STL result.
    residual: Any
    # Seasonal period used for the fit (caller-supplied or inferred).
    period: int

trend `instance-attribute` ¶

trend: Any

seasonal `instance-attribute` ¶

seasonal: Any

residual `instance-attribute` ¶

residual: Any

period `instance-attribute` ¶

period: int

OaxacaBlinderResult `dataclass` ¶

Oaxaca-Blinder decomposition of an outcome gap.

Source code in src/nyc311/stats/_equity.py

@dataclass(frozen=True, slots=True)
class OaxacaBlinderResult:
    """Oaxaca-Blinder decomposition of an outcome gap.

    Immutable, slotted record (``frozen=True, slots=True``).
    """

    # NOTE(review): the code that fills this record is not visible in this
    # section; the field notes are inferred from names — confirm in
    # _equity.py.
    # Presumably the mean outcome of group A.
    mean_group_a: float
    # Presumably the mean outcome of group B.
    mean_group_b: float
    # Presumably the difference between the two group means; sign
    # convention to be confirmed.
    total_gap: float
    # Presumably the part of the gap attributed to differences in regressors.
    explained: float
    # Presumably the part of the gap not attributed to regressors.
    unexplained: float
    # Presumably per-regressor contributions keyed by regressor name.
    component_contributions: dict[str, float]
    # Presumably the number of observations in group A.
    n_group_a: int
    # Presumably the number of observations in group B.
    n_group_b: int

mean_group_a `instance-attribute` ¶

mean_group_a: float

mean_group_b `instance-attribute` ¶

mean_group_b: float

total_gap `instance-attribute` ¶

total_gap: float

explained `instance-attribute` ¶

explained: float

unexplained `instance-attribute` ¶

unexplained: float

component_contributions `instance-attribute` ¶

component_contributions: dict[str, float]

n_group_a `instance-attribute` ¶

n_group_a: int

n_group_b `instance-attribute` ¶

n_group_b: int

TheilResult `dataclass` ¶

Population-weighted Theil T index with group decomposition.

Source code in src/nyc311/stats/_equity.py

@dataclass(frozen=True, slots=True)
class TheilResult:
    """Population-weighted Theil T index with group decomposition.

    Immutable, slotted record (``frozen=True, slots=True``).
    """

    # NOTE(review): the code that fills this record is not visible in this
    # section; the field notes are inferred from names — confirm in
    # _equity.py.
    # Presumably the overall Theil T index.
    total: float
    # Presumably the between-group component of the index.
    between_group: float
    # Presumably the within-group component of the index.
    within_group: float
    # Presumably each unit's contribution to the index, keyed by unit id.
    unit_contributions: dict[str, float]
    # Presumably the number of units included in the calculation.
    n_units: int

total `instance-attribute` ¶

total: float

between_group `instance-attribute` ¶

between_group: float

within_group `instance-attribute` ¶

within_group: float

unit_contributions `instance-attribute` ¶

unit_contributions: dict[str, float]

n_units `instance-attribute` ¶

n_units: int

GWRResult `dataclass` ¶

Result of a geographically weighted regression.

Source code in src/nyc311/stats/_gwr.py

@dataclass(frozen=True, slots=True)
class GWRResult:
    """Result of a geographically weighted regression.

    Immutable, slotted record (``frozen=True, slots=True``).
    """

    # NOTE(review): the code that fills this record is not visible in this
    # section; the field notes are inferred from names — confirm in _gwr.py.
    # Presumably one tuple of local estimates per coefficient name,
    # aligned with ``unit_ids``.
    local_coefficients: dict[str, tuple[float, ...]]
    # Presumably the local R-squared per unit, aligned with ``unit_ids``.
    local_r_squared: tuple[float, ...]
    # Presumably the kernel bandwidth used for the fit; units unknown here.
    bandwidth: float
    # Presumably the model's Akaike information criterion.
    aic: float
    # Presumably the ids of the units the local estimates refer to.
    unit_ids: tuple[str, ...]
    # Presumably the R-squared of the model as a whole.
    global_r_squared: float
    # Presumably the number of observations used.
    n_observations: int
    # Presumably a human-readable text summary of the fit.
    model_summary: str

local_coefficients `instance-attribute` ¶

local_coefficients: dict[str, tuple[float, ...]]

local_r_squared `instance-attribute` ¶

local_r_squared: tuple[float, ...]

bandwidth `instance-attribute` ¶

bandwidth: float

aic `instance-attribute` ¶

aic: float

unit_ids `instance-attribute` ¶

unit_ids: tuple[str, ...]

global_r_squared `instance-attribute` ¶

global_r_squared: float

n_observations `instance-attribute` ¶

n_observations: int

model_summary `instance-attribute` ¶

model_summary: str

HawkesResult `dataclass` ¶

Result of a Hawkes process estimation.

Source code in src/nyc311/stats/_hawkes.py

@dataclass(frozen=True, slots=True)
class HawkesResult:
    """Result of a Hawkes process estimation.

    Immutable, slotted record (``frozen=True, slots=True``).
    """

    # NOTE(review): the code that fills this record is not visible in this
    # section; the field notes are inferred from names — confirm in
    # _hawkes.py.
    # Presumably the estimated baseline event intensity.
    background_rate: float
    # Presumably the ``alpha`` parameter of the triggering kernel.
    triggering_kernel_alpha: float
    # Presumably the ``beta`` parameter of the triggering kernel.
    triggering_kernel_beta: float
    # Presumably the expected number of events triggered per event;
    # exact formula to be confirmed.
    branching_ratio: float
    # Presumably the number of events used in the estimation.
    n_events: int
    # Presumably the log-likelihood at the estimated parameters.
    log_likelihood: float
    # Presumably a human-readable text summary of the fit.
    model_summary: str

background_rate `instance-attribute` ¶

background_rate: float

triggering_kernel_alpha `instance-attribute` ¶

triggering_kernel_alpha: float

triggering_kernel_beta `instance-attribute` ¶

triggering_kernel_beta: float

branching_ratio `instance-attribute` ¶

branching_ratio: float

n_events `instance-attribute` ¶

n_events: int

log_likelihood `instance-attribute` ¶

log_likelihood: float

model_summary `instance-attribute` ¶

model_summary: str

ITSResult `dataclass` ¶

Result of a segmented interrupted-time-series regression.

Source code in src/nyc311/stats/_its.py

@dataclass(frozen=True, slots=True)
class ITSResult:
    """Result of a segmented interrupted-time-series regression.

    Immutable, slotted record (``frozen=True, slots=True``).
    """

    # NOTE(review): the code that fills this record is not visible in this
    # section; the field notes are inferred from names — confirm in _its.py.
    # Presumably the slope before the interruption.
    pre_trend: float
    # Presumably the slope after the interruption.
    post_trend: float
    # Presumably the immediate change in level at the interruption.
    level_change: float
    # Presumably the change in slope at the interruption.
    trend_change: float
    # Presumably the p-value for ``level_change``.
    p_value_level: float
    # Presumably the p-value for ``trend_change``.
    p_value_trend: float
    # Presumably a human-readable text summary of the fit.
    model_summary: str

pre_trend `instance-attribute` ¶

pre_trend: float

post_trend `instance-attribute` ¶

post_trend: float

level_change `instance-attribute` ¶

level_change: float

trend_change `instance-attribute` ¶

trend_change: float

p_value_level `instance-attribute` ¶

p_value_level: float

p_value_trend `instance-attribute` ¶

p_value_trend: float

model_summary `instance-attribute` ¶

model_summary: str

PanelRegressionResult `dataclass` ¶

Summary of a panel regression fit.

Source code in src/nyc311/stats/_panel_models.py

@dataclass(frozen=True, slots=True)
class PanelRegressionResult:
    """Summary of a panel regression fit.

    Immutable, slotted record (``frozen=True, slots=True``).
    """

    # NOTE(review): the code that fills this record is not visible in this
    # section; the field notes are inferred from names — confirm in
    # _panel_models.py.
    # Presumably the name of the estimator used; allowed values unknown here.
    method: str
    # Presumably coefficient estimates keyed by regressor name.
    coefficients: dict[str, float]
    # Presumably standard errors keyed like ``coefficients``.
    std_errors: dict[str, float]
    # Presumably p-values keyed like ``coefficients``.
    p_values: dict[str, float]
    # Presumably the fit's R-squared; which variant is to be confirmed.
    r_squared: float
    # Presumably the number of observations used.
    n_observations: int
    # Presumably the number of panel entities.
    n_entities: int
    # Presumably the number of time periods.
    n_periods: int
    # Presumably a human-readable text summary of the fit.
    model_summary: str

method `instance-attribute` ¶

method: str

coefficients `instance-attribute` ¶

coefficients: dict[str, float]

std_errors `instance-attribute` ¶

std_errors: dict[str, float]

p_values `instance-attribute` ¶

p_values: dict[str, float]

r_squared `instance-attribute` ¶

r_squared: float

n_observations `instance-attribute` ¶

n_observations: int

n_entities `instance-attribute` ¶

n_entities: int

n_periods `instance-attribute` ¶

n_periods: int

model_summary `instance-attribute` ¶

model_summary: str

PowerResult `dataclass` ¶

Result of a power / minimum detectable effect calculation.

Source code in src/nyc311/stats/_power.py

@dataclass(frozen=True, slots=True)
class PowerResult:
    """Result of a power / minimum detectable effect calculation.

    Immutable, slotted record (``frozen=True, slots=True``).
    """

    # NOTE(review): the code that fills this record is not visible in this
    # section; the field notes are inferred from names — confirm in
    # _power.py.
    # Presumably the minimum detectable effect; scale to be confirmed.
    mde: float
    # Presumably the significance level used.
    alpha: float
    # Presumably the statistical power used.
    power: float
    # Presumably the number of units in the design.
    n_units: int
    # Presumably the number of periods in the design.
    n_periods: int
    # Presumably the intraclass correlation coefficient used.
    icc: float
    # Presumably the share of variance explained by covariates.
    variance_explained: float

mde `instance-attribute` ¶

mde: float

alpha `instance-attribute` ¶

alpha: float

power `instance-attribute` ¶

power: float

n_units `instance-attribute` ¶

n_units: int

n_periods `instance-attribute` ¶

n_periods: int

icc `instance-attribute` ¶

icc: float

variance_explained `instance-attribute` ¶

variance_explained: float

RDResult `dataclass` ¶

Result of a regression discontinuity estimation.

Source code in src/nyc311/stats/_rdd.py

@dataclass(frozen=True, slots=True)
class RDResult:
    """Result of a regression discontinuity estimation.

    Immutable, slotted record (``frozen=True, slots=True``).
    """

    # NOTE(review): the code that fills this record is not visible in this
    # section; the field notes are inferred from names — confirm in _rdd.py.
    # Presumably the estimated effect at the cutoff.
    treatment_effect: float
    # Presumably a robust standard error of the effect.
    se_robust: float
    # Presumably the p-value of the effect.
    p_value: float
    # Presumably the lower confidence bound; level to be confirmed.
    ci_lower: float
    # Presumably the upper confidence bound; level to be confirmed.
    ci_upper: float
    # Presumably the bandwidth used left of the cutoff.
    bandwidth_left: float
    # Presumably the bandwidth used right of the cutoff.
    bandwidth_right: float
    # Presumably the number of observations used left of the cutoff.
    n_effective_left: int
    # Presumably the number of observations used right of the cutoff.
    n_effective_right: int
    # Presumably the name of the weighting kernel; allowed values unknown here.
    kernel: str
    # Presumably a human-readable text summary of the fit.
    model_summary: str

treatment_effect `instance-attribute` ¶

treatment_effect: float

se_robust `instance-attribute` ¶

se_robust: float

p_value `instance-attribute` ¶

p_value: float

ci_lower `instance-attribute` ¶

ci_lower: float

ci_upper `instance-attribute` ¶

ci_upper: float

bandwidth_left `instance-attribute` ¶

bandwidth_left: float

bandwidth_right `instance-attribute` ¶

bandwidth_right: float

n_effective_left `instance-attribute` ¶

n_effective_left: int

n_effective_right `instance-attribute` ¶

n_effective_right: int

kernel `instance-attribute` ¶

kernel: str

model_summary `instance-attribute` ¶

model_summary: str

LatentReportingResult `dataclass` ¶

Result of latent reporting-bias EM estimation.

Source code in src/nyc311/stats/_reporting_bias.py

@dataclass(frozen=True, slots=True)
class LatentReportingResult:
    """Result of latent reporting-bias EM estimation.

    Immutable, slotted record (``frozen=True, slots=True``).
    """

    # NOTE(review): the code that fills this record is not visible in this
    # section; the field notes are inferred from names — confirm in
    # _reporting_bias.py.
    # Presumably the estimated underlying rate per unit id.
    estimated_true_rates: dict[str, float]
    # Presumably the estimated reporting probability per unit id.
    reporting_probabilities: dict[str, float]
    # Presumably the observed rate per unit id.
    observed_rates: dict[str, float]
    # Presumably the number of EM iterations that were run.
    n_iterations: int
    # Presumably whether the EM loop met its convergence criterion.
    converged: bool
    # Presumably the log-likelihood recorded at each iteration.
    log_likelihood_trace: tuple[float, ...]

estimated_true_rates `instance-attribute` ¶

estimated_true_rates: dict[str, float]

reporting_probabilities `instance-attribute` ¶

reporting_probabilities: dict[str, float]

observed_rates `instance-attribute` ¶

observed_rates: dict[str, float]

n_iterations `instance-attribute` ¶

n_iterations: int

converged `instance-attribute` ¶

converged: bool

log_likelihood_trace `instance-attribute` ¶

log_likelihood_trace: tuple[float, ...]

ReportingAdjustmentResult `dataclass` ¶

Result of ecometric reporting-rate adjustment.

Source code in src/nyc311/stats/_reporting_bias.py

@dataclass(frozen=True, slots=True)
class ReportingAdjustmentResult:
    """Result of ecometric reporting-rate adjustment.

    Immutable, slotted record (``frozen=True, slots=True``).
    """

    # NOTE(review): the code that fills this record is not visible in this
    # section; the field notes are inferred from names — confirm in
    # _reporting_bias.py.
    # Presumably the unadjusted rate per unit id.
    raw_rates: dict[str, float]
    # Presumably the adjusted rate per unit id.
    adjusted_rates: dict[str, float]
    # Presumably the factor relating raw and adjusted rates per unit id;
    # direction of the ratio to be confirmed.
    adjustment_factors: dict[str, float]
    # Presumably the names of the covariates used in the adjustment model.
    covariates_used: tuple[str, ...]
    # Presumably the intraclass correlation coefficient of the model.
    icc: float
    # Presumably a human-readable text summary of the fit.
    model_summary: str

raw_rates `instance-attribute` ¶

raw_rates: dict[str, float]

adjusted_rates `instance-attribute` ¶

adjusted_rates: dict[str, float]

adjustment_factors `instance-attribute` ¶

adjustment_factors: dict[str, float]

covariates_used `instance-attribute` ¶

covariates_used: tuple[str, ...]

icc `instance-attribute` ¶

icc: float

model_summary `instance-attribute` ¶

model_summary: str

LISAResult `dataclass` ¶

Local Indicators of Spatial Association.

Source code in src/nyc311/stats/_spatial.py

@dataclass(frozen=True, slots=True)
class LISAResult:
    """Local Indicators of Spatial Association.

    Immutable, slotted record (``frozen=True, slots=True``).
    """

    # NOTE(review): the code that fills this record is not visible in this
    # section; the field notes are inferred from names — confirm in
    # _spatial.py. The four tuples are presumably aligned element-wise.
    # Presumably the local statistic for each unit.
    local_statistic: tuple[float, ...]
    # Presumably the p-value for each unit's local statistic.
    p_values: tuple[float, ...]
    # Presumably a cluster-type label per unit; label vocabulary unknown here.
    cluster_labels: tuple[str, ...]
    # Presumably the ids of the units the other tuples refer to.
    unit_ids: tuple[str, ...]

local_statistic `instance-attribute` ¶

local_statistic: tuple[float, ...]

p_values `instance-attribute` ¶

p_values: tuple[float, ...]

cluster_labels `instance-attribute` ¶

cluster_labels: tuple[str, ...]

unit_ids `instance-attribute` ¶

unit_ids: tuple[str, ...]

MoranResult `dataclass` ¶

Global Moran's I test result.

Source code in src/nyc311/stats/_spatial.py

@dataclass(frozen=True, slots=True)
class MoranResult:
    """Global Moran's I test result.

    Immutable, slotted record (``frozen=True, slots=True``).
    """

    # NOTE(review): the code that fills this record is not visible in this
    # section; the field notes are inferred from names — confirm in
    # _spatial.py.
    # Presumably the observed Moran's I value.
    statistic: float
    # Presumably the p-value of the test; inference method to be confirmed.
    p_value: float
    # Presumably the standardised (z) value of the statistic.
    z_score: float
    # Presumably the expected value of the statistic under the null.
    expected: float

statistic `instance-attribute` ¶

statistic: float

p_value `instance-attribute` ¶

p_value: float

z_score `instance-attribute` ¶

z_score: float

expected `instance-attribute` ¶

expected: float

SpatialErrorResult `dataclass` ¶

Result of a spatial error (SEM) model.

Source code in src/nyc311/stats/_spatial_regression.py

@dataclass(frozen=True, slots=True)
class SpatialErrorResult:
    """Result of a spatial error (SEM) model.

    Immutable, slotted record (``frozen=True, slots=True``).
    """

    # NOTE(review): the code that fills this record is not visible in this
    # section; the field notes are inferred from names — confirm in
    # _spatial_regression.py.
    # Presumably coefficient estimates keyed by regressor name.
    coefficients: dict[str, float]
    # Presumably standard errors keyed like ``coefficients``.
    std_errors: dict[str, float]
    # Presumably p-values keyed like ``coefficients``.
    p_values: dict[str, float]
    # Presumably the spatial error (lambda) parameter; the name avoids
    # the ``lambda`` keyword.
    lam: float
    # Presumably the p-value of ``lam``.
    lam_p_value: float
    # Presumably the log-likelihood of the fitted model.
    log_likelihood: float
    # Presumably the model's Akaike information criterion.
    aic: float
    # Presumably the number of observations used.
    n_observations: int
    # Presumably a human-readable text summary of the fit.
    model_summary: str

coefficients `instance-attribute` ¶

coefficients: dict[str, float]

std_errors `instance-attribute` ¶

std_errors: dict[str, float]

p_values `instance-attribute` ¶

p_values: dict[str, float]

lam `instance-attribute` ¶

lam: float

lam_p_value `instance-attribute` ¶

lam_p_value: float

log_likelihood `instance-attribute` ¶

log_likelihood: float

aic `instance-attribute` ¶

aic: float

n_observations `instance-attribute` ¶

n_observations: int

model_summary `instance-attribute` ¶

model_summary: str

SpatialLagResult `dataclass` ¶

Result of a spatial lag (SAR) model.

Source code in src/nyc311/stats/_spatial_regression.py

@dataclass(frozen=True, slots=True)
class SpatialLagResult:
    """Result of a spatial lag (SAR) model.

    Immutable, slotted record (``frozen=True, slots=True``).
    """

    # NOTE(review): the code that fills this record is not visible in this
    # section; the field notes are inferred from names — confirm in
    # _spatial_regression.py.
    # Presumably coefficient estimates keyed by regressor name.
    coefficients: dict[str, float]
    # Presumably standard errors keyed like ``coefficients``.
    std_errors: dict[str, float]
    # Presumably p-values keyed like ``coefficients``.
    p_values: dict[str, float]
    # Presumably the spatial lag (rho) parameter.
    rho: float
    # Presumably the p-value of ``rho``.
    rho_p_value: float
    # Presumably the log-likelihood of the fitted model.
    log_likelihood: float
    # Presumably the model's Akaike information criterion.
    aic: float
    # Presumably the number of observations used.
    n_observations: int
    # Presumably a human-readable text summary of the fit.
    model_summary: str

coefficients `instance-attribute` ¶

coefficients: dict[str, float]

std_errors `instance-attribute` ¶

std_errors: dict[str, float]

p_values `instance-attribute` ¶

p_values: dict[str, float]

rho `instance-attribute` ¶

rho: float

rho_p_value `instance-attribute` ¶

rho_p_value: float

log_likelihood `instance-attribute` ¶

log_likelihood: float

aic `instance-attribute` ¶

aic: float

n_observations `instance-attribute` ¶

n_observations: int

model_summary `instance-attribute` ¶

model_summary: str

EventStudyResult `dataclass` ¶

Event-study coefficients with pre-trend diagnostics.

Source code in src/nyc311/stats/_staggered_did.py

@dataclass(frozen=True, slots=True)
class EventStudyResult:
    """Event-study coefficients with pre-trend diagnostics.

    Immutable, slotted record (``frozen=True, slots=True``).
    """

    # NOTE(review): the code that fills this record is not visible in this
    # section; the field notes are inferred from names — confirm in
    # _staggered_did.py. The tuples are presumably aligned element-wise
    # with ``relative_periods``.
    # Presumably one coefficient per relative period.
    coefficients: tuple[float, ...]
    # Presumably the standard error of each coefficient.
    std_errors: tuple[float, ...]
    # Presumably the lower confidence bound of each coefficient.
    ci_lower: tuple[float, ...]
    # Presumably the upper confidence bound of each coefficient.
    ci_upper: tuple[float, ...]
    # Presumably the event-time offsets the coefficients refer to.
    relative_periods: tuple[int, ...]
    # Presumably the F statistic of a joint pre-trend test; the type
    # allows ``None`` (condition for that not visible here).
    pre_trend_f_statistic: float | None
    # Presumably the p-value of the pre-trend test; the type allows ``None``.
    pre_trend_p_value: float | None
    # Presumably the relative period used as the omitted baseline.
    reference_period: int

coefficients `instance-attribute` ¶

coefficients: tuple[float, ...]

std_errors `instance-attribute` ¶

std_errors: tuple[float, ...]

ci_lower `instance-attribute` ¶

ci_lower: tuple[float, ...]

ci_upper `instance-attribute` ¶

ci_upper: tuple[float, ...]

relative_periods `instance-attribute` ¶

relative_periods: tuple[int, ...]

pre_trend_f_statistic `instance-attribute` ¶

pre_trend_f_statistic: float | None

pre_trend_p_value `instance-attribute` ¶

pre_trend_p_value: float | None

reference_period `instance-attribute` ¶

reference_period: int

GroupTimeATT `dataclass` ¶

A single group-time average treatment effect.

Source code in src/nyc311/stats/_staggered_did.py

@dataclass(frozen=True, slots=True)
class GroupTimeATT:
    """A single group-time average treatment effect.

    Immutable, slotted record (``frozen=True, slots=True``). Instances
    are the element type of ``StaggeredDiDResult.group_time_atts``.
    """

    # NOTE(review): the code that fills this record is not visible in this
    # section; the field notes are inferred from names — confirm in
    # _staggered_did.py.
    # Presumably the label of the treatment cohort; format unknown here.
    group: str
    # Presumably the label of the time period; format unknown here.
    period: str
    # Presumably the estimated average treatment effect on the treated.
    att: float
    # Presumably the standard error of ``att``.
    se: float
    # Presumably the p-value of ``att``.
    p_value: float

group `instance-attribute` ¶

group: str

period `instance-attribute` ¶

period: str

att `instance-attribute` ¶

att: float

se `instance-attribute` ¶

se: float

p_value `instance-attribute` ¶

p_value: float

StaggeredDiDResult `dataclass` ¶

Result of a staggered difference-in-differences estimation.

Source code in src/nyc311/stats/_staggered_did.py

@dataclass(frozen=True, slots=True)
class StaggeredDiDResult:
    """Result of a staggered difference-in-differences estimation.

    Immutable, slotted record (``frozen=True, slots=True``).
    """

    # NOTE(review): the code that fills this record is not visible in this
    # section; the field notes are inferred from names — confirm in
    # _staggered_did.py.
    # The individual group-time effects, one ``GroupTimeATT`` each.
    group_time_atts: tuple[GroupTimeATT, ...]
    # Presumably the overall effect aggregated across group-time cells;
    # aggregation scheme to be confirmed.
    aggregated_att: float
    # Presumably the standard error of ``aggregated_att``.
    aggregated_se: float
    # Presumably the p-value of ``aggregated_att``.
    aggregated_p_value: float
    # Presumably the lower confidence bound of ``aggregated_att``.
    aggregated_ci_lower: float
    # Presumably the upper confidence bound of ``aggregated_att``.
    aggregated_ci_upper: float
    # Presumably the number of treatment cohorts.
    n_groups: int
    # Presumably the number of time periods.
    n_periods: int
    # Presumably a human-readable text summary of the fit.
    model_summary: str

group_time_atts `instance-attribute` ¶

group_time_atts: tuple[GroupTimeATT, ...]

aggregated_att `instance-attribute` ¶

aggregated_att: float

aggregated_se `instance-attribute` ¶

aggregated_se: float

aggregated_p_value `instance-attribute` ¶

aggregated_p_value: float

aggregated_ci_lower `instance-attribute` ¶

aggregated_ci_lower: float

aggregated_ci_upper `instance-attribute` ¶

aggregated_ci_upper: float

n_groups `instance-attribute` ¶

n_groups: int

n_periods `instance-attribute` ¶

n_periods: int

model_summary `instance-attribute` ¶

model_summary: str

SyntheticControlResult `dataclass` ¶

Result of a synthetic control analysis.

Source code in src/nyc311/stats/_synthetic_control.py

@dataclass(frozen=True, slots=True)
class SyntheticControlResult:
    """Result of a synthetic control analysis.

    Immutable, slotted record (``frozen=True, slots=True``).
    """

    # NOTE(review): the code that fills this record is not visible in this
    # section; the field notes are inferred from names — confirm in
    # _synthetic_control.py. The per-period tuples are presumably aligned
    # with ``periods``.
    # Presumably the id of the treated unit.
    treated_unit: str
    # Presumably the weight assigned to each donor unit, keyed by unit id.
    donor_weights: dict[str, float]
    # Presumably the synthetic (counterfactual) outcome per period.
    counterfactual: tuple[float, ...]
    # Presumably the treated unit's observed outcome per period.
    observed: tuple[float, ...]
    # Presumably the per-period gap between observed and counterfactual.
    treatment_effect: tuple[float, ...]
    # Presumably the average effect over the post-treatment periods.
    att: float
    # Presumably the period labels; format unknown here.
    periods: tuple[str, ...]
    # Presumably the mean squared prediction error before treatment.
    pre_treatment_mspe: float
    # Presumably a placebo-based p-value; the type allows ``None``
    # (condition for that not visible here).
    placebo_p_value: float | None
    # Presumably a human-readable text summary of the fit.
    model_summary: str

treated_unit `instance-attribute` ¶

treated_unit: str

donor_weights `instance-attribute` ¶

donor_weights: dict[str, float]

counterfactual `instance-attribute` ¶

counterfactual: tuple[float, ...]

observed `instance-attribute` ¶

observed: tuple[float, ...]

treatment_effect `instance-attribute` ¶

treatment_effect: tuple[float, ...]

att `instance-attribute` ¶

att: float

periods `instance-attribute` ¶

periods: tuple[str, ...]

pre_treatment_mspe `instance-attribute` ¶

pre_treatment_mspe: float

placebo_p_value `instance-attribute` ¶

placebo_p_value: float | None

model_summary `instance-attribute` ¶

model_summary: str

detect_stl_anomalies ¶

detect_stl_anomalies(
    series: Any,
    *,
    period: int | None = None,
    threshold: float = 2.0,
) -> STLAnomalyResult

Detect anomalies using STL decomposition residuals.

Decomposes series via STL and flags observations whose absolute residual z-score exceeds threshold.

Parameters:

Name	Type	Description	Default
`series`	`Any`	A `pandas.Series` indexed by a `DatetimeIndex`.	required
`period`	`int \| None`	Seasonal period in observations. When `None`, the period is inferred from the index frequency.	`None`
`threshold`	`float`	Absolute z-score threshold above which an observation is flagged as anomalous. Defaults to `2.0`.	`2.0`

Returns:

Type	Description
`STLAnomalyResult`	An `STLAnomalyResult` with the anomaly dates, their z-scores, and summary statistics of the residual distribution.

Raises:

Type	Description
`ImportError`	If numpy, statsmodels, or pandas is not installed. Install with `pip install nyc311[stats]`.

Source code in src/nyc311/stats/_anomaly.py

def detect_stl_anomalies(
    series: Any,
    *,
    period: int | None = None,
    threshold: float = 2.0,
) -> STLAnomalyResult:
    """Flag observations whose STL residual is unusually large.

    The series is decomposed with :func:`seasonal_decompose`; the
    non-missing residuals are standardised with their own mean and sample
    standard deviation, and every observation whose absolute z-score is
    strictly greater than ``threshold`` is reported.

    Args:
        series: A ``pandas.Series`` indexed by a ``DatetimeIndex``.
        period: Seasonal period in observations, forwarded unchanged to
            :func:`seasonal_decompose` (which infers it when ``None``).
        threshold: Absolute z-score cut-off. Defaults to ``2.0``.

    Returns:
        STLAnomalyResult: The flagged index labels and their z-scores,
        plus the residual mean and standard deviation. When the residual
        standard deviation is zero (including fewer than two residuals)
        no anomalies are reported and ``residual_std`` is ``0.0``.

    Raises:
        ImportError: If numpy is not installed, or if
            :func:`seasonal_decompose` cannot import statsmodels/pandas.
            Install with ``pip install nyc311[stats]``.
    """
    try:
        import numpy as np
    except ImportError as exc:
        msg = "numpy is required for detect_stl_anomalies(). Install with: pip install nyc311[stats]"
        raise ImportError(msg) from exc

    from nyc311.stats._decomposition import seasonal_decompose

    residual = seasonal_decompose(series, period=period).residual.dropna()
    values = np.asarray(residual.values, dtype=float)

    center = float(np.mean(values))
    # A sample standard deviation needs at least two points.
    if len(values) > 1:
        spread = float(np.std(values, ddof=1))
    else:
        spread = 0.0

    # Defaults describe the "nothing flagged" outcome; they are only
    # overwritten when the residuals can actually be standardised.
    flagged_dates: tuple[Any, ...] = ()
    flagged_scores: tuple[float, ...] = ()
    flagged_count = 0
    if spread != 0.0:
        scores = (values - center) / spread
        flagged = np.abs(scores) > threshold
        flagged_dates = tuple(residual.index[flagged])
        flagged_scores = tuple(float(score) for score in scores[flagged])
        flagged_count = int(flagged.sum())

    return STLAnomalyResult(
        anomaly_dates=flagged_dates,
        anomaly_scores=flagged_scores,
        threshold=threshold,
        n_anomalies=flagged_count,
        residual_mean=center,
        residual_std=spread,
    )

bym2_smooth ¶

bym2_smooth(
    observed_counts: dict[str, int],
    expected_counts: dict[str, float],
    adjacency: dict[str, tuple[str, ...]],
    *,
    n_samples: int = 2000,
    n_tune: int = 1000,
    random_seed: int = 42,
) -> BYM2Result

Smooth area-level rates with the BYM2 model.

Estimates: y_i ~ Poisson(E_i * exp(mu + sigma * phi_i))

where phi_i = sqrt(rho) * spatial_i + sqrt(1 - rho) * iid_i and sigma is the overall standard deviation of the combined random effect.

The mixing parameter rho controls the balance between spatially structured and unstructured random effects.

Parameters:

Name	Type	Description	Default
`observed_counts`	`dict[str, int]`	Mapping `{unit_id: observed_count}`.	required
`expected_counts`	`dict[str, float]`	Mapping `{unit_id: expected_count}`.	required
`adjacency`	`dict[str, tuple[str, ...]]`	Mapping `{unit_id: (neighbor_ids,...)}`.	required
`n_samples`	`int`	Number of posterior draws after tuning.	`2000`
`n_tune`	`int`	Number of warmup / tuning iterations.	`1000`
`random_seed`	`int`	Random seed for reproducibility.	`42`

Returns:

Type	Description
`BYM2Result`	A `BYM2Result` with smoothed rates, 95% credible intervals, and variance decomposition.

Raises:

Type	Description
`ImportError`	If pymc is not installed. Install the optional bayes extra with `pip install nyc311[bayes]`.

Source code in src/nyc311/stats/_bym2.py

def bym2_smooth(
    observed_counts: dict[str, int],
    expected_counts: dict[str, float],
    adjacency: dict[str, tuple[str, ...]],
    *,
    n_samples: int = 2000,
    n_tune: int = 1000,
    random_seed: int = 42,
) -> BYM2Result:
    """Smooth area-level rates with the BYM2 model.

    The model fitted is::

        y_i   ~ Poisson(E_i * exp(psi_i))
        psi_i = mu + sigma * (sqrt(rho) * phi_i + sqrt(1 - rho) * theta_i)

    where ``phi`` is an ICAR (spatially structured) effect on the
    adjacency graph, ``theta`` is an unstructured standard-normal effect,
    ``sigma`` is the overall scale and the mixing parameter ``rho``
    controls the balance between the structured and unstructured parts.

    The adjacency graph is treated as undirected: an edge is used when
    *either* endpoint lists the other, and repeated or self-referencing
    entries are ignored.

    Args:
        observed_counts: Mapping ``{unit_id: observed_count}``. Its keys
            define the set of modelled units.
        expected_counts: Mapping ``{unit_id: expected_count}``; must
            contain every key of ``observed_counts``.
        adjacency: Mapping ``{unit_id: (neighbor_ids,...)}``. Neighbours
            that are not keys of ``observed_counts`` are ignored.
        n_samples: Number of posterior draws after tuning (passed to the
            sampler as ``draws``).
        n_tune: Number of warmup / tuning iterations.
        random_seed: Random seed for reproducibility.

    Returns:
        BYM2Result: Posterior-mean rates, 2.5%/97.5% credible bounds and
        the spatial / unstructured variance split.

    Raises:
        ImportError: If pymc (or numpy) is not installed. Install with
            ``pip install nyc311[bayes]``.
        KeyError: If ``expected_counts`` lacks a unit present in
            ``observed_counts``.
    """
    try:
        import numpy as np
        import pymc as pm
    except ImportError as exc:
        msg = (
            "pymc is required for bym2_smooth(). "
            "Install with: pip install nyc311[bayes]"
        )
        raise ImportError(msg) from exc

    # Sorted ids give a deterministic unit -> row-index mapping.
    unit_ids = sorted(observed_counts)
    n = len(unit_ids)
    uid_to_idx = {uid: i for i, uid in enumerate(unit_ids)}

    y = np.array([observed_counts[uid] for uid in unit_ids], dtype=float)
    e = np.array([expected_counts[uid] for uid in unit_ids], dtype=float)

    # Normalise every listed relation to a (low, high) index pair and
    # de-duplicate. This keeps an edge that only the higher-sorted unit
    # declares and prevents repeated neighbours from yielding repeated
    # edges; self-references and unknown neighbours are skipped.
    edge_set: set[tuple[int, int]] = set()
    for uid, i in uid_to_idx.items():
        for nb in adjacency.get(uid, ()):
            j = uid_to_idx.get(nb)
            if j is None or j == i:
                continue
            edge_set.add((min(i, j), max(i, j)))
    adj_pairs = sorted(edge_set)

    # Explicit integer dtype so an empty edge list is still an index array.
    node1 = np.array([low for low, _ in adj_pairs], dtype=int)
    node2 = np.array([high for _, high in adj_pairs], dtype=int)

    with pm.Model() as _model:
        mu = pm.Normal("mu", mu=0, sigma=1)
        sigma = pm.HalfNormal("sigma", sigma=1)
        rho = pm.Beta("rho", alpha=1, beta=1)

        theta = pm.Normal("theta", mu=0, sigma=1, shape=n)
        phi = pm.ICAR("phi", W=_build_adjacency_matrix(n, node1, node2))

        # Log relative rate: intercept plus the scaled mixture of the
        # structured (phi) and unstructured (theta) effects.
        psi = pm.Deterministic(
            "psi",
            mu + sigma * (pm.math.sqrt(rho) * phi + pm.math.sqrt(1 - rho) * theta),
        )
        rate = pm.Deterministic("rate", pm.math.exp(psi))

        pm.Poisson("obs", mu=e * rate, observed=y)

        trace = pm.sample(
            draws=n_samples,
            tune=n_tune,
            random_seed=random_seed,
            progressbar=False,
            return_inferencedata=True,
        )

    # Pool all chains and draws into one (samples, units) matrix.
    rate_samples = trace.posterior["rate"].values.reshape(-1, n)
    smoothed = rate_samples.mean(axis=0)
    lower = np.percentile(rate_samples, 2.5, axis=0)
    upper = np.percentile(rate_samples, 97.5, axis=0)

    rho_samples = trace.posterior["rho"].values.flatten()
    sigma_samples = trace.posterior["sigma"].values.flatten()
    mixing = float(np.mean(rho_samples))
    total_var = float(np.mean(sigma_samples**2))
    # Split the total variance by the posterior-mean mixing weight.
    spatial_var = mixing * total_var
    iid_var = (1 - mixing) * total_var

    summary = (
        f"BYM2: {n} areas, {len(adj_pairs)} edges\n"
        f"Mixing (rho): {mixing:.3f}\n"
        f"Total variance (sigma^2): {total_var:.4f}\n"
        f"Spatial / IID: {spatial_var:.4f} / {iid_var:.4f}"
    )

    return BYM2Result(
        smoothed_rates={uid: float(smoothed[i]) for i, uid in enumerate(unit_ids)},
        credible_lower={uid: float(lower[i]) for i, uid in enumerate(unit_ids)},
        credible_upper={uid: float(upper[i]) for i, uid in enumerate(unit_ids)},
        mixing_parameter=mixing,
        spatial_variance=spatial_var,
        iid_variance=iid_var,
        unit_ids=tuple(unit_ids),
        n_samples=n_samples,
        model_summary=summary,
    )

detect_changepoints ¶

detect_changepoints(
    series: Any,
    *,
    method: Literal["pelt", "binseg"] = "pelt",
    penalty: float | None = None,
    min_segment_size: int = 5,
) -> ChangepointResult

Detect structural breaks in a complaint time series.

Parameters:

Name	Type	Description	Default
`series`	`Any`	A `pandas.Series` indexed by a `DatetimeIndex`.	required
`method`	`Literal['pelt', 'binseg']`	Detection algorithm; one of `"pelt"` (default, optimal) or `"binseg"` (binary segmentation, faster but approximate).	`'pelt'`
`penalty`	`float \| None`	Penalty value passed to the underlying `ruptures` algorithm. When `None`, defaults to `log(n) * variance`, a BIC-like heuristic.	`None`
`min_segment_size`	`int`	Minimum number of observations between consecutive changepoints.	`5`

Returns:

Type	Description
`ChangepointResult`	A `ChangepointResult` containing the integer breakpoint indices, their corresponding dates, the resulting segment count, and the penalty actually used.

Raises:

Type	Description
`ImportError`	If `ruptures` or pandas is not installed. Install the optional stats extra with `pip install nyc311[stats]`.
`TypeError`	If `series` does not use a `DatetimeIndex`.

Source code in src/nyc311/stats/_changepoint.py

def detect_changepoints(
    series: Any,
    *,
    method: Literal["pelt", "binseg"] = "pelt",
    penalty: float | None = None,
    min_segment_size: int = 5,
) -> ChangepointResult:
    """Detect structural breaks in a complaint time series.

    Missing values are dropped before detection, so the returned
    breakpoint indices are positions in the NaN-free series.

    Args:
        series: A ``pandas.Series`` indexed by a ``DatetimeIndex``.
        method: Detection algorithm; one of ``"pelt"`` (default,
            optimal) or ``"binseg"`` (binary segmentation, faster but
            approximate).
        penalty: Penalty value passed to the underlying ``ruptures``
            algorithm. When ``None``, defaults to ``log(n) * variance``,
            a BIC-like heuristic (``1.0`` when fewer than two
            observations remain).
        min_segment_size: Minimum number of observations between
            consecutive changepoints.

    Returns:
        ChangepointResult: The integer breakpoint indices, their
        corresponding dates, the resulting segment count, and the
        penalty actually used.

    Raises:
        ValueError: If ``method`` is neither ``"pelt"`` nor ``"binseg"``.
        ImportError: If ``ruptures`` or pandas is not installed. Install
            the optional stats extra with ``pip install nyc311[stats]``.
        TypeError: If ``series`` does not use a ``DatetimeIndex``.
    """
    # Reject unknown methods up front instead of silently running Binseg
    # for anything that is not exactly "pelt" (e.g. a typo such as "PELT").
    if method not in ("pelt", "binseg"):
        msg = f"method must be 'pelt' or 'binseg', got {method!r}."
        raise ValueError(msg)

    try:
        import numpy as np
        import pandas as pd
        import ruptures as rpt
    except ImportError as exc:
        message = (
            "ruptures and pandas are required for detect_changepoints(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(message) from exc

    if not isinstance(series.index, pd.DatetimeIndex):
        msg = "series must have a DatetimeIndex."
        raise TypeError(msg)

    # Drop missing values once; the signal and the date lookup below must
    # be based on the same NaN-free series.
    clean = series.dropna()
    signal = clean.to_numpy().astype(float)
    n = len(signal)

    if penalty is None:
        penalty = float(np.log(n) * np.var(signal)) if n > 1 else 1.0

    detector_cls = rpt.Pelt if method == "pelt" else rpt.Binseg
    algo = detector_cls(model="l2", min_size=min_segment_size).fit(signal)

    raw_breaks: list[int] = algo.predict(pen=penalty)
    # ruptures returns the *end* of each segment; the last element equals n
    breakpoint_indices = [b for b in raw_breaks if b < n]

    dates_index = clean.index
    breakpoint_dates: list[date] = []
    for idx in breakpoint_indices:
        ts = dates_index[idx]
        breakpoint_dates.append(ts.date() if hasattr(ts, "date") else ts)

    return ChangepointResult(
        breakpoints=tuple(breakpoint_indices),
        breakpoint_dates=tuple(breakpoint_dates),
        n_segments=len(breakpoint_indices) + 1,
        penalty=penalty,
    )

seasonal_decompose ¶

seasonal_decompose(
    series: Any, *, period: int | None = None
) -> DecompositionResult

Decompose series into trend, seasonal, and residual components.

Wraps `statsmodels.tsa.seasonal.STL`. The series must be indexed by a `DatetimeIndex`.

Parameters:

Name	Type	Description	Default
`series`	`Any`	A `pandas.Series` indexed by a `DatetimeIndex`.	required
`period`	`int \| None`	Seasonal period in observations. When `None`, the period is inferred from the index frequency (monthly → 12, weekly → 52, daily → 7, quarterly → 4, yearly → 1).	`None`

Returns:

Type	Description
`DecompositionResult`	A `DecompositionResult` exposing the trend, seasonal, and residual `pandas.Series` plus the period actually used.

Raises:

Type	Description
`ImportError`	If statsmodels or pandas is not installed. Install the optional stats extra with `pip install nyc311[stats]`.
`TypeError`	If `series` does not use a `DatetimeIndex`.

Source code in src/nyc311/stats/_decomposition.py

def seasonal_decompose(
    series: Any,
    *,
    period: int | None = None,
) -> DecompositionResult:
    """Run an STL decomposition on a datetime-indexed series.

    The series is handed, with missing values dropped, to
    :class:`statsmodels.tsa.seasonal.STL`, and the fitted components are
    repackaged as a :class:`DecompositionResult`.

    Args:
        series: A ``pandas.Series`` whose index is a ``DatetimeIndex``.
        period: Number of observations per seasonal cycle. When omitted,
            the frequency reported by ``pandas.infer_freq`` for the full
            index (before missing values are dropped) is converted to a
            period by ``_infer_period``.

    Returns:
        The fitted ``trend``, ``seasonal``, and ``residual`` components
            wrapped in a ``DecompositionResult``, together with the
            ``period`` that was used for the fit.

    Raises:
        ImportError: If pandas or statsmodels cannot be imported. Install
            the optional stats extra with ``pip install nyc311[stats]``.
        TypeError: If ``series.index`` is not a ``DatetimeIndex``.
    """
    try:
        import pandas as pd
        from statsmodels.tsa.seasonal import STL
    except ImportError as exc:
        install_hint = (
            "statsmodels and pandas are required for seasonal_decompose(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(install_hint) from exc

    if not isinstance(series.index, pd.DatetimeIndex):
        index_error = "series must have a DatetimeIndex."
        raise TypeError(index_error)

    # The frequency is read from the original index; gaps introduced by
    # dropping NaNs below therefore do not influence the inferred period.
    if period is None:
        period = _infer_period(pd.infer_freq(series.index))

    observed = series.dropna()
    fitted = STL(observed, period=period).fit()

    return DecompositionResult(
        trend=fitted.trend,
        seasonal=fitted.seasonal,
        residual=fitted.resid,
        period=period,
    )

oaxaca_blinder_decomposition ¶

oaxaca_blinder_decomposition(
    group_a: Any,
    group_b: Any,
    outcome: str,
    regressors: tuple[str, ...],
) -> OaxacaBlinderResult

Decompose the mean-outcome gap between two groups.

Uses the Oaxaca-Blinder twofold decomposition with group B coefficients as the reference:

gap = (mean(X_a) - mean(X_b)) @ beta_b  [explained]
    + mean(X_a) @ (beta_a - beta_b)      [unexplained]

Parameters:

Name	Type	Description	Default
`group_a`	`Any`	`pandas.DataFrame` for the first group.	required
`group_b`	`Any`	`pandas.DataFrame` for the second group.	required
`outcome`	`str`	Name of the outcome column.	required
`regressors`	`tuple[str, ...]`	Column names to include as explanatory variables.	required

Returns:

Type	Description
`OaxacaBlinderResult`	An `OaxacaBlinderResult` with the total gap, explained and unexplained components, and per-variable contributions.

Raises:

Type	Description
`ImportError`	If numpy is not installed. Install the optional stats extra with `pip install nyc311[stats]`.
`ValueError`	If fewer than 2 observations exist in either group.

Source code in src/nyc311/stats/_equity.py

def oaxaca_blinder_decomposition(
    group_a: Any,
    group_b: Any,
    outcome: str,
    regressors: tuple[str, ...],
) -> OaxacaBlinderResult:
    """Split the gap in mean outcomes between two groups into two parts.

    An OLS model with an intercept is fitted separately to each group.
    The twofold Oaxaca-Blinder decomposition then uses the group B
    coefficients as the reference:

        gap = (mean(X_a) - mean(X_b)) @ beta_b  [explained]
            + [1, mean(X_a)] @ (beta_a - beta_b) [unexplained]

    The unexplained term includes the difference in intercepts.

    Args:
        group_a: ``pandas.DataFrame`` holding the first group's rows.
        group_b: ``pandas.DataFrame`` holding the second group's rows.
        outcome: Name of the outcome column in both frames.
        regressors: Names of the explanatory columns in both frames.

    Returns:
        The group means, total gap, explained and unexplained parts,
            per-regressor explained contributions, and group sizes,
            wrapped in an ``OaxacaBlinderResult``.

    Raises:
        ImportError: If numpy is not installed.
        ValueError: If either group has fewer than 2 observations.
    """
    try:
        import numpy as np
    except ImportError as exc:
        msg = "numpy is required for oaxaca_blinder_decomposition(). Install with: pip install nyc311[stats]"
        raise ImportError(msg) from exc

    def _as_float_column(frame: Any, column: str) -> Any:
        # One column of a frame as a float ndarray.
        return np.asarray(frame[column].values, dtype=float)

    def _design_matrix(frame: Any) -> Any:
        # Regressor columns side by side, without the intercept.
        return np.column_stack([_as_float_column(frame, r) for r in regressors])

    def _ols_coefficients(features: Any, target: Any) -> Any:
        # Least-squares fit with a leading intercept column.
        with_intercept = np.column_stack([np.ones(len(features)), features])
        return np.linalg.lstsq(with_intercept, target, rcond=None)[0]

    y_a = _as_float_column(group_a, outcome)
    y_b = _as_float_column(group_b, outcome)

    if min(len(y_a), len(y_b)) < 2:
        msg = "Each group must have at least 2 observations."
        raise ValueError(msg)

    x_a = _design_matrix(group_a)
    x_b = _design_matrix(group_b)

    coef_a = _ols_coefficients(x_a, y_a)
    coef_b = _ols_coefficients(x_b, y_b)

    x_bar_a = x_a.mean(axis=0)
    x_bar_b = x_b.mean(axis=0)

    outcome_mean_a = float(y_a.mean())
    outcome_mean_b = float(y_b.mean())

    # Endowment effect per regressor, priced at group B's slopes.
    per_regressor = (x_bar_a - x_bar_b) * coef_b[1:]
    # Coefficient effect, evaluated at group A's means (intercept included).
    a_profile = np.concatenate([[1], x_bar_a])
    coefficient_effect = float(a_profile @ (coef_a - coef_b))

    return OaxacaBlinderResult(
        mean_group_a=outcome_mean_a,
        mean_group_b=outcome_mean_b,
        total_gap=outcome_mean_a - outcome_mean_b,
        explained=float(per_regressor.sum()),
        unexplained=coefficient_effect,
        component_contributions={
            name: float(part) for name, part in zip(regressors, per_regressor)
        },
        n_group_a=len(y_a),
        n_group_b=len(y_b),
    )

theil_index ¶

theil_index(
    values: dict[str, float],
    populations: dict[str, int],
    *,
    groups: dict[str, str] | None = None,
) -> TheilResult

Compute the population-weighted Theil T index.

When groups is provided, decomposes the total index into between-group and within-group components.

Parameters:

Name	Type	Description	Default
`values`	`dict[str, float]`	Mapping `{unit_id: value}` of the variable to measure inequality over (e.g. complaint rate).	required
`populations`	`dict[str, int]`	Mapping `{unit_id: population}` for weighting.	required
`groups`	`dict[str, str] \| None`	Optional mapping `{unit_id: group_label}` for decomposition. When `None`, between-group and within-group are both set to `0.0`.	`None`

Returns:

Type	Description
`TheilResult`	A `TheilResult` with the total index, between/within components, per-unit contributions, and count.

Raises:

Type	Description
`ImportError`	If numpy is not installed.
`ValueError`	If values and populations have mismatched keys.

Source code in src/nyc311/stats/_equity.py

def theil_index(
    values: dict[str, float],
    populations: dict[str, int],
    *,
    groups: dict[str, str] | None = None,
) -> TheilResult:
    """Measure inequality of ``values`` with a population-weighted Theil T.

    Each unit contributes ``share * log(value / mean)``, where ``share``
    is the unit's part of the population-weighted total and ``mean`` is
    the population-weighted mean. Units with a non-positive value
    contribute ``0.0``. If the weighted total or the total population is
    not positive, every component of the result is ``0.0``.

    Args:
        values: Mapping ``{unit_id: value}`` of the quantity whose
            inequality is measured.
        populations: Mapping ``{unit_id: population}`` used as weights.
            Must have exactly the same keys as ``values``.
        groups: Optional mapping ``{unit_id: group_label}``. When given,
            between-group and within-group components are accumulated
            per label; units absent from the mapping belong to no group.
            Groups whose population or weighted value is not positive
            are skipped. When ``None`` both components stay ``0.0``.

    Returns:
        The total index, between/within components, per-unit
            contributions, and unit count, wrapped in a ``TheilResult``.

    Raises:
        ImportError: If numpy is not installed.
        ValueError: If ``values`` and ``populations`` have different keys.
    """
    try:
        import numpy as np
    except ImportError as exc:
        msg = "numpy is required for theil_index(). Install with: pip install nyc311[stats]"
        raise ImportError(msg) from exc

    def _log_relative(level: Any, mean: Any) -> Any:
        # log(level / mean) where level is positive, 0.0 elsewhere.
        with np.errstate(divide="ignore", invalid="ignore"):
            return np.where(level > 0, np.log(level / mean), 0.0)

    unit_ids = sorted(values)
    if set(unit_ids) != set(populations):
        msg = "values and populations must have the same keys."
        raise ValueError(msg)

    n_units = len(unit_ids)
    level = np.array([values[uid] for uid in unit_ids], dtype=float)
    weight = np.array([populations[uid] for uid in unit_ids], dtype=float)

    population_sum = weight.sum()
    weighted_sum = (level * weight).sum()

    # Degenerate input: nothing to distribute, so report zero inequality.
    if weighted_sum <= 0 or population_sum <= 0:
        return TheilResult(
            total=0.0,
            between_group=0.0,
            within_group=0.0,
            unit_contributions=dict.fromkeys(unit_ids, 0.0),
            n_units=n_units,
        )

    overall_mean = weighted_sum / population_sum
    unit_terms = ((level * weight) / weighted_sum) * _log_relative(level, overall_mean)

    between_total = 0.0
    within_total = 0.0
    labels = [] if groups is None else sorted(set(groups.values()))
    for label in labels:
        in_group = np.array([groups.get(uid) == label for uid in unit_ids])
        group_level = level[in_group]
        group_weight = weight[in_group]
        group_population = group_weight.sum()
        group_weighted = (group_level * group_weight).sum()
        if group_population <= 0 or group_weighted <= 0:
            continue

        group_mean = group_weighted / group_population
        group_share = group_weighted / weighted_sum
        between_total += group_share * float(np.log(group_mean / overall_mean))

        inner_shares = (group_level * group_weight) / group_weighted
        inner_logs = _log_relative(group_level, group_mean)
        within_total += group_share * float(np.sum(inner_shares * inner_logs))

    return TheilResult(
        total=float(np.sum(unit_terms)),
        between_group=between_total,
        within_group=within_total,
        unit_contributions={
            uid: float(term) for uid, term in zip(unit_ids, unit_terms)
        },
        n_units=n_units,
    )

geographically_weighted_regression ¶

geographically_weighted_regression(
    values: dict[str, float],
    regressors: dict[str, dict[str, float]],
    coordinates: dict[str, tuple[float, float]],
    *,
    bandwidth: float | None = None,
    kernel: str = "bisquare",
) -> GWRResult

Fit a geographically weighted regression.

Estimates locally varying coefficients, allowing the relationship between outcome and regressors to change across space.

Parameters:

Name	Type	Description	Default
`values`	`dict[str, float]`	Mapping `{unit_id: outcome_value}`.	required
`regressors`	`dict[str, dict[str, float]]`	Mapping `{unit_id: {variable_name: value}}`.	required
`coordinates`	`dict[str, tuple[float, float]]`	Mapping `{unit_id: (latitude, longitude)}`.	required
`bandwidth`	`float \| None`	Fixed bandwidth. When `None`, an optimal bandwidth is selected via cross-validation.	`None`
`kernel`	`str`	Kernel function. One of `"bisquare"` (default), `"gaussian"`, or `"exponential"`.	`'bisquare'`

Returns:

Type	Description
`GWRResult`	A `GWRResult` with local coefficients per unit, local R-squared values, bandwidth, and fit statistics.

Raises:

Type	Description
`ImportError`	If numpy or scipy is not installed. Install the optional extra with `pip install nyc311[spatial-regression]`.
`ValueError`	If fewer than 5 observations are provided.

Source code in src/nyc311/stats/_gwr.py

def geographically_weighted_regression(
    values: dict[str, float],
    regressors: dict[str, dict[str, float]],
    coordinates: dict[str, tuple[float, float]],
    *,
    bandwidth: float | None = None,
    kernel: str = "bisquare",
) -> GWRResult:
    """Fit a geographically weighted regression.

    For every unit a weighted least-squares model (with an intercept
    named ``"CONSTANT"``) is fitted, using kernel weights derived from
    the distances between that unit and all others. Coefficients can
    therefore vary across space.

    Units are processed in sorted ``unit_id`` order, and every per-unit
    tuple in the result follows the order of ``unit_ids``. Distances are
    Euclidean distances between the supplied coordinate pairs (the
    default metric of ``scipy.spatial.distance.cdist``).

    Args:
        values: Mapping ``{unit_id: outcome_value}``.
        regressors: Mapping ``{unit_id: {variable_name: value}}``. The
            variable names are read from the first entry of the mapping
            and must be present for every unit in ``values``.
        coordinates: Mapping ``{unit_id: (latitude, longitude)}``.
        bandwidth: Fixed bandwidth.  When ``None``, a bandwidth is
            chosen by ``_cv_bandwidth``.
        kernel: Kernel function name forwarded to ``_kernel_weights``.
            One of ``"bisquare"`` (default), ``"gaussian"``, or
            ``"exponential"``.

    Returns:
        The local coefficients per unit, local R-squared values,
            bandwidth, and fit statistics, wrapped in a ``GWRResult``.

    Raises:
        ImportError: If numpy or scipy is not installed.
        ValueError: If fewer than 5 observations are provided.
        KeyError: If a unit in ``values`` is missing from ``regressors``
            or ``coordinates``, or lacks one of the regressor variables.
    """
    try:
        import numpy as np
        from scipy.spatial.distance import cdist
    except ImportError as exc:
        msg = (
            "numpy and scipy are required for "
            "geographically_weighted_regression(). "
            "Install with: pip install nyc311[spatial-regression]"
        )
        raise ImportError(msg) from exc

    unit_ids = sorted(values)
    if len(unit_ids) < 5:
        msg = "GWR requires at least 5 observations."
        raise ValueError(msg)

    var_names = sorted(next(iter(regressors.values())).keys())
    y = np.array([values[uid] for uid in unit_ids], dtype=float)
    x_raw = np.column_stack(
        [
            np.array([regressors[uid][v] for uid in unit_ids], dtype=float)
            for v in var_names
        ]
    )
    x = np.column_stack([np.ones(len(unit_ids)), x_raw])
    coords = np.array([coordinates[uid] for uid in unit_ids], dtype=float)

    dists = cdist(coords, coords)

    if bandwidth is None:
        bandwidth = _cv_bandwidth(y, x, dists, kernel)

    all_names = ["CONSTANT", *var_names]
    n = len(unit_ids)
    k = x.shape[1]
    local_betas = np.zeros((n, k))
    local_r2 = np.zeros(n)
    # Prediction for unit i taken from the model fitted *at* unit i.
    y_hat = np.zeros(n)

    for i in range(n):
        w_i = _kernel_weights(dists[i], bandwidth, kernel)
        # Scale each observation by its weight via broadcasting. This is
        # X^T W with W = diag(w_i), without materialising the n x n
        # diagonal matrix for every unit.
        xtw = x.T * w_i
        xtwx = xtw @ x
        xtwy = xtw @ y
        try:
            beta_i = np.linalg.solve(xtwx, xtwy)
        except np.linalg.LinAlgError:
            # Singular local design: fall back to the least-squares solution.
            beta_i = np.linalg.lstsq(xtwx, xtwy, rcond=None)[0]
        local_betas[i] = beta_i
        y_hat[i] = x[i] @ beta_i

        ss_tot = float(np.sum(w_i * (y - np.average(y, weights=w_i)) ** 2))
        ss_res = float(np.sum(w_i * (y - x @ beta_i) ** 2))
        local_r2[i] = 1.0 - ss_res / ss_tot if ss_tot > 0 else 0.0

    ss_tot_global = float(np.sum((y - np.mean(y)) ** 2))
    ss_res_global = float(np.sum((y - y_hat) ** 2))
    global_r2 = 1.0 - ss_res_global / ss_tot_global if ss_tot_global > 0 else 0.0

    # NOTE: a perfect fit (zero residual sum of squares) makes this -inf.
    aic_val = n * np.log(ss_res_global / n) + 2 * k

    local_coefficients = {
        name: tuple(float(b) for b in local_betas[:, j])
        for j, name in enumerate(all_names)
    }

    summary = (
        f"GWR: {n} observations, {k} parameters\n"
        f"Bandwidth: {bandwidth:.4f}, Kernel: {kernel}\n"
        f"Global R-squared: {global_r2:.4f}, AIC: {aic_val:.2f}"
    )

    return GWRResult(
        local_coefficients=local_coefficients,
        local_r_squared=tuple(float(r) for r in local_r2),
        bandwidth=float(bandwidth),
        aic=float(aic_val),
        unit_ids=tuple(unit_ids),
        global_r_squared=float(global_r2),
        n_observations=n,
        model_summary=summary,
    )

fit_hawkes_process ¶

fit_hawkes_process(
    event_times: Any,
    *,
    kernel: str = "exponential",
    max_iter: int = 1000,
) -> HawkesResult

Fit a univariate Hawkes process to event timestamps.

The conditional intensity is:

lambda(t) = mu + sum_{t_i < t} alpha * beta * exp(-beta * (t - t_i))

Parameters:

Name	Type	Description	Default
`event_times`	`Any`	Array-like of event timestamps as floats (e.g. seconds since epoch, or days since start).	required
`kernel`	`str`	Triggering kernel type. Currently only `"exponential"` is supported.	`'exponential'`
`max_iter`	`int`	Maximum iterations for the EM algorithm.	`1000`

Returns:

Type	Description
`HawkesResult`	A `HawkesResult` with background rate, triggering kernel parameters, branching ratio, and log-likelihood.

Raises:

Type	Description
`ImportError`	If numpy is not installed.
`ValueError`	If `kernel` is not `"exponential"` or fewer than 3 events are provided.

Source code in src/nyc311/stats/_hawkes.py

def fit_hawkes_process(
    event_times: Any,
    *,
    kernel: str = "exponential",
    max_iter: int = 1000,
) -> HawkesResult:
    """Fit a univariate Hawkes process to event timestamps.

    The conditional intensity is:

        lambda(t) = mu + sum_{t_i < t} alpha * beta * exp(-beta * (t - t_i))

    The triggering kernel ``alpha * beta * exp(-beta * s)`` integrates
    to ``alpha`` over ``s >= 0``. ``alpha`` is therefore the expected
    number of events directly triggered by one event, and it is what is
    reported as the branching ratio.

    Parameters are estimated by EM. Each iteration computes, for every
    event, the probability that it was triggered by each earlier event
    or by the background, then re-estimates ``mu``, ``alpha`` and
    ``beta`` from those probabilities. Iteration stops when all three
    parameters change by less than ``1e-8`` or after ``max_iter``
    rounds. Event times are sorted and shifted so the first event is at
    time 0; the observation window is the span from first to last event.

    Args:
        event_times: Array-like of event timestamps as floats
            (e.g. seconds since epoch, or days since start).
        kernel: Triggering kernel type. Currently only
            ``"exponential"`` is supported.
        max_iter: Maximum iterations for the EM algorithm.

    Returns:
        The background rate, triggering kernel parameters, branching
            ratio, event count, log-likelihood, and a text summary,
            wrapped in a ``HawkesResult``.

    Raises:
        ImportError: If numpy is not installed.
        ValueError: If ``kernel`` is not ``"exponential"``, if fewer
            than 3 events are provided, or if the event times do not
            span a finite, positive duration.
    """
    if kernel != "exponential":
        msg = f"Only 'exponential' kernel is supported, got {kernel!r}"
        raise ValueError(msg)

    try:
        import numpy as np
    except ImportError as exc:
        msg = "numpy is required for fit_hawkes_process(). Install with: pip install nyc311[stats]"
        raise ImportError(msg) from exc

    times = np.sort(np.asarray(event_times, dtype=float))
    n = len(times)

    if n < 3:
        msg = "Need at least 3 events to fit a Hawkes process."
        raise ValueError(msg)

    t_max = float(times[-1] - times[0])
    # A zero span (all events simultaneous) or NaN/inf timestamps would
    # make every rate below inf/NaN, so reject them up front.
    if not (np.isfinite(t_max) and t_max > 0):
        msg = "Event times must span a finite, positive duration."
        raise ValueError(msg)
    times = times - times[0]

    # lags[i, j] = t_i - t_j for every earlier event j < i, zero elsewhere.
    # These do not depend on the parameters, so build them once.
    is_earlier = np.tril(np.ones((n, n), dtype=bool), k=-1)
    lags = np.where(is_earlier, times[:, None] - times[None, :], 0.0)

    def _trigger_matrix(alpha_: float, beta_: float) -> Any:
        # Kernel contribution of each earlier event j to the intensity at event i.
        return np.where(is_earlier, alpha_ * beta_ * np.exp(-beta_ * lags), 0.0)

    mu = n / (2.0 * t_max)
    alpha = 0.1
    beta_param = 1.0
    tolerance = 1e-8

    for _ in range(max_iter):
        # E-step: split each event's intensity into background and parents.
        trigger = _trigger_matrix(alpha, beta_param)
        totals = mu + trigger.sum(axis=1)
        # totals >= mu > 0 because the first event always counts as background.
        p = trigger / totals[:, None]
        background_prob = mu / totals
        background_prob[0] = 1.0  # the first event has no possible parent
        n_background = float(background_prob.sum())

        # M-step.
        mu_new = n_background / t_max
        n_triggered = n - n_background
        alpha_new = n_triggered / n
        beta_new = beta_param
        if n_triggered > 0:
            weighted_dt_sum = float(np.sum(p * lags))
            if weighted_dt_sum > 0:
                beta_new = n_triggered / weighted_dt_sum

        converged = (
            abs(mu_new - mu) < tolerance
            and abs(alpha_new - alpha) < tolerance
            and abs(beta_new - beta_param) < tolerance
        )
        mu, alpha, beta_param = mu_new, alpha_new, beta_new
        if converged:
            break

    # Log-likelihood: sum of log-intensities at the events minus the
    # integrated intensity (compensator) over [0, t_max].
    intensity = mu + _trigger_matrix(alpha, beta_param).sum(axis=1)
    ll = float(np.sum(np.log(np.maximum(intensity, 1e-10))))
    ll -= mu * t_max
    ll += float(np.sum(alpha * (np.exp(-beta_param * (t_max - times)) - 1.0)))

    # With the normalised kernel alpha * beta * exp(-beta * s), the kernel
    # mass is alpha itself; dividing by beta would belong to the
    # un-normalised alpha * exp(-beta * s) parameterisation.
    branching = alpha

    summary = (
        f"Hawkes Process: {n} events over {t_max:.1f} time units\n"
        f"Background rate (mu): {mu:.4f}\n"
        f"Triggering: alpha={alpha:.4f}, beta={beta_param:.4f}\n"
        f"Branching ratio: {branching:.4f}\n"
        f"Log-likelihood: {ll:.2f}"
    )

    return HawkesResult(
        background_rate=float(mu),
        triggering_kernel_alpha=float(alpha),
        triggering_kernel_beta=float(beta_param),
        branching_ratio=float(branching),
        n_events=n,
        log_likelihood=float(ll),
        model_summary=summary,
    )

interrupted_time_series ¶

interrupted_time_series(
    series: Any,
    intervention_date: date,
    *,
    covariates: Any | None = None,
) -> ITSResult

Fit a segmented interrupted-time-series regression.

Estimates pre-intervention level and trend, the immediate level change at intervention_date, and the post-intervention trend change, following the standard ITS regression specification.

Parameters:

Name	Type	Description	Default
`series`	`Any`	A `pandas.Series` indexed by a `DatetimeIndex` containing the outcome to model.	required
`intervention_date`	`date`	The date the intervention began. Observations on or after this date are treated as post-intervention.	required
`covariates`	`Any \| None`	Optional `pandas.DataFrame` of exogenous regressors aligned to `series`. Each column is added to the design matrix.	`None`

Returns:

Type	Description
`ITSResult`	An `ITSResult` with pre/post trends, the level and trend changes at `intervention_date`, p-values for the level and trend coefficients, and the full model summary string.

Raises:

Type	Description
`ImportError`	If statsmodels or pandas is not installed. Install the optional stats extra with `pip install nyc311[stats]`.
`TypeError`	If `series` does not use a `DatetimeIndex`.

Source code in src/nyc311/stats/_its.py

def interrupted_time_series(
    series: Any,
    intervention_date: date,
    *,
    covariates: Any | None = None,
) -> ITSResult:
    """Fit a segmented interrupted-time-series regression.

    The model fitted by OLS is:

        y = b0 + b1 * time + b2 * intervention + b3 * time_after (+ covariates)

    where ``time`` counts observations from 0, ``intervention`` is 1 on
    or after ``intervention_date`` and 0 before, and ``time_after`` is
    the number of observations elapsed since the first post-intervention
    observation (0 before the intervention). Because ``time_after`` is
    0 at the first post-intervention observation, ``b2`` is the
    immediate level change there, and ``b3`` is the change in slope.

    Args:
        series: A ``pandas.Series`` indexed by a ``DatetimeIndex``
            containing the outcome to model. Rows with missing values
            are dropped by the regression.
        intervention_date: The date the intervention began. Observations
            on or after this date are treated as post-intervention.
        covariates: Optional ``pandas.DataFrame`` of exogenous regressors
            with one row per observation of ``series``, in the same
            order (columns are attached by position). Each column is
            added to the design matrix.

    Returns:
        The pre/post trends, the level and trend changes at
            ``intervention_date``, p-values for the level and trend-change
            coefficients, and the full model summary string, wrapped in
            an ``ITSResult``.

    Raises:
        ImportError: If statsmodels or pandas is not installed. Install
            the optional stats extra with ``pip install nyc311[stats]``.
        TypeError: If ``series`` does not use a ``DatetimeIndex``.
        ValueError: If a covariate column is named ``"y"``, ``"time"``,
            ``"intervention"`` or ``"time_after"``, which would overwrite
            a column of the design matrix.
    """
    try:
        import numpy as np
        import pandas as pd
        from statsmodels.regression.linear_model import OLS
        from statsmodels.tools import add_constant
    except ImportError as exc:
        message = (
            "statsmodels and pandas are required for interrupted_time_series(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(message) from exc

    if not isinstance(series.index, pd.DatetimeIndex):
        msg = "series must have a DatetimeIndex."
        raise TypeError(msg)

    df = pd.DataFrame({"y": series})
    df["time"] = np.arange(len(df))
    is_post = np.asarray(df.index >= pd.Timestamp(intervention_date), dtype=bool)
    df["intervention"] = is_post.astype(int)
    # Start the post-intervention clock at the first post observation.
    # Using the raw `time` here would turn the `intervention` coefficient
    # into the gap between the two fitted lines extrapolated back to
    # time 0 rather than the jump at the intervention itself.
    first_post = int(is_post.argmax()) if is_post.any() else 0
    df["time_after"] = (df["time"] - first_post) * df["intervention"]

    exog_cols = ["time", "intervention", "time_after"]
    if covariates is not None:
        reserved = {"y", *exog_cols}
        for col in covariates.columns:
            if col in reserved:
                msg = (
                    f"covariate column {col!r} clashes with a reserved "
                    "design-matrix column name."
                )
                raise ValueError(msg)
            df[col] = covariates[col].to_numpy()
            exog_cols.append(col)

    exog = add_constant(df[exog_cols])
    model = OLS(df["y"], exog, missing="drop").fit()

    pre_trend = float(model.params["time"])
    trend_change = float(model.params["time_after"])
    post_trend = pre_trend + trend_change
    level_change = float(model.params["intervention"])
    p_level = float(model.pvalues["intervention"])
    p_trend = float(model.pvalues["time_after"])

    return ITSResult(
        pre_trend=pre_trend,
        post_trend=post_trend,
        level_change=level_change,
        trend_change=trend_change,
        p_value_level=p_level,
        p_value_trend=p_trend,
        model_summary=str(model.summary()),
    )

panel_fixed_effects ¶

panel_fixed_effects(
    panel: PanelDataset,
    outcome: str,
    regressors: tuple[str, ...],
    *,
    time_effects: bool = False,
    cluster: Literal["entity", "time", "both"] = "entity",
) -> PanelRegressionResult

Estimate a fixed-effects panel regression.

Wraps `linearmodels.panel.PanelOLS` with entity fixed effects by default and optional two-way fixed effects.

Parameters:

Name	Type	Description	Default
`panel`	`PanelDataset`	A `nyc311.temporal.PanelDataset` providing the data, entities, and periods.	required
`outcome`	`str`	Name of the dependent variable column.	required
`regressors`	`tuple[str, ...]`	Names of independent variable columns.	required
`time_effects`	`bool`	When `True`, include time fixed effects in addition to entity fixed effects (two-way FE).	`False`
`cluster`	`Literal['entity', 'time', 'both']`	Cluster standard errors by `"entity"` (default), `"time"`, or `"both"`.	`'entity'`

Returns:

Type	Description
`PanelRegressionResult`	A `PanelRegressionResult` with coefficients, standard errors, p-values, R-squared, observation counts, and the full `linearmodels` summary string.

Raises:

Type	Description
`ImportError`	If `linearmodels` or pandas is not installed. Install the optional stats extra with `pip install nyc311[stats]`.
`ValueError`	If `outcome` or any of `regressors` is missing from the panel.

Source code in src/nyc311/stats/_panel_models.py

def panel_fixed_effects(
    panel: PanelDataset,
    outcome: str,
    regressors: tuple[str, ...],
    *,
    time_effects: bool = False,
    cluster: Literal["entity", "time", "both"] = "entity",
) -> PanelRegressionResult:
    """Fit a panel regression with entity (and optionally time) fixed effects.

    The data frame produced by ``_prepare_panel_data`` is passed to
    :class:`linearmodels.panel.PanelOLS`. Entity effects are always
    included; time effects are added on request. Standard errors use the
    clustered covariance estimator.

    Args:
        panel: A :class:`~nyc311.temporal.PanelDataset` supplying the
            data to model.
        outcome: Column name of the dependent variable.
        regressors: Column names of the independent variables.
        time_effects: If ``True``, add time fixed effects (two-way FE);
            the result's ``method`` is then ``"two_way_fe"`` instead of
            ``"entity_fe"``.
        cluster: Dimension(s) to cluster standard errors on:
            ``"entity"`` (default), ``"time"``, or ``"both"``. Any other
            value raises ``KeyError``.

    Returns:
        The coefficients, standard errors, p-values, R-squared,
            observation/entity/period counts, and the ``linearmodels``
            summary text, wrapped in a ``PanelRegressionResult``.

    Raises:
        ImportError: If ``linearmodels`` is not installed. Install the
            optional stats extra with ``pip install nyc311[stats]``.
        ValueError: If ``outcome`` or any of ``regressors`` is missing
            from the panel (presumably raised by ``_prepare_panel_data``;
            confirm there).
    """
    try:
        from linearmodels.panel import PanelOLS
    except ImportError as exc:
        install_hint = (
            "linearmodels is required for panel regressions. "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(install_hint) from exc

    def _float_mapping(labelled: Any) -> dict[str, float]:
        # Convert a labelled statistics series into a plain str -> float dict.
        return {str(label): float(value) for label, value in labelled.items()}

    frame = _prepare_panel_data(panel, outcome, regressors)
    dependent = frame[outcome]
    exogenous = frame[list(regressors)]

    # All supported choices share one estimator; the lookup below is what
    # rejects an unsupported ``cluster`` value.
    covariance_for = dict.fromkeys(("entity", "time", "both"), "clustered")
    by_entity = cluster in ("entity", "both")
    by_time = cluster in ("time", "both")

    estimator = PanelOLS(
        dependent,
        exogenous,
        entity_effects=True,
        time_effects=time_effects,
    )
    fitted = estimator.fit(
        cov_type=covariance_for[cluster],
        cluster_entity=by_entity,
        cluster_time=by_time,
    )

    if time_effects:
        method_label = "two_way_fe"
    else:
        method_label = "entity_fe"

    return PanelRegressionResult(
        method=method_label,
        coefficients=_float_mapping(fitted.params),
        std_errors=_float_mapping(fitted.std_errors),
        p_values=_float_mapping(fitted.pvalues),
        r_squared=float(fitted.rsquared),
        n_observations=int(fitted.nobs),
        n_entities=int(fitted.entity_info.total),
        n_periods=int(fitted.time_info.total),
        model_summary=str(fitted.summary),
    )

panel_random_effects ¶

panel_random_effects(
    panel: PanelDataset,
    outcome: str,
    regressors: tuple[str, ...],
) -> PanelRegressionResult

Estimate a random-effects panel regression.

Wraps `linearmodels.panel.RandomEffects`.

Parameters:

Name	Type	Description	Default
`panel`	`PanelDataset`	A `nyc311.temporal.PanelDataset` providing the data, entities, and periods.	required
`outcome`	`str`	Name of the dependent variable column.	required
`regressors`	`tuple[str, ...]`	Names of independent variable columns.	required

Returns:

Type	Description
`PanelRegressionResult`	A `PanelRegressionResult` with coefficients, standard errors, p-values, R-squared, observation counts, and the full `linearmodels` summary string.

Raises:

Type	Description
`ImportError`	If `linearmodels` or pandas is not installed. Install the optional stats extra with `pip install nyc311[stats]`.
`ValueError`	If `outcome` or any of `regressors` is missing from the panel.

Source code in src/nyc311/stats/_panel_models.py

def panel_random_effects(
    panel: PanelDataset,
    outcome: str,
    regressors: tuple[str, ...],
) -> PanelRegressionResult:
    """Fit a random-effects panel regression.

    The data frame produced by ``_prepare_panel_data`` is passed to
    :class:`linearmodels.panel.RandomEffects` and fitted with that
    estimator's default settings.

    Args:
        panel: A :class:`~nyc311.temporal.PanelDataset` supplying the
            data to model.
        outcome: Column name of the dependent variable.
        regressors: Column names of the independent variables.

    Returns:
        The coefficients, standard errors, p-values, R-squared,
            observation/entity/period counts, and the ``linearmodels``
            summary text, wrapped in a ``PanelRegressionResult`` whose
            ``method`` is ``"random_effects"``.

    Raises:
        ImportError: If ``linearmodels`` is not installed. Install the
            optional stats extra with ``pip install nyc311[stats]``.
        ValueError: If ``outcome`` or any of ``regressors`` is missing
            from the panel (presumably raised by ``_prepare_panel_data``;
            confirm there).
    """
    try:
        from linearmodels.panel import RandomEffects
    except ImportError as exc:
        install_hint = (
            "linearmodels is required for panel regressions. "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(install_hint) from exc

    def _float_mapping(labelled: Any) -> dict[str, float]:
        # Convert a labelled statistics series into a plain str -> float dict.
        return {str(label): float(value) for label, value in labelled.items()}

    frame = _prepare_panel_data(panel, outcome, regressors)
    fitted = RandomEffects(frame[outcome], frame[list(regressors)]).fit()

    return PanelRegressionResult(
        method="random_effects",
        coefficients=_float_mapping(fitted.params),
        std_errors=_float_mapping(fitted.std_errors),
        p_values=_float_mapping(fitted.pvalues),
        r_squared=float(fitted.rsquared),
        n_observations=int(fitted.nobs),
        n_entities=int(fitted.entity_info.total),
        n_periods=int(fitted.time_info.total),
        model_summary=str(fitted.summary),
    )

minimum_detectable_effect ¶

minimum_detectable_effect(
    n_units: int,
    n_periods: int,
    *,
    icc: float = 0.05,
    alpha: float = 0.05,
    power: float = 0.8,
    proportion_treated: float = 0.5,
    outcome_variance: float = 1.0,
    r_squared: float = 0.0,
) -> PowerResult

Compute the minimum detectable effect for a panel experiment.

Uses the standard cluster-RCT MDE formula:

MDE = (z_{alpha/2} + z_{beta}) * sqrt(2 * sigma^2 * DE / (N_t * T))

where DE = 1 + (T - 1) * ICC is the design effect.

Parameters:

Name	Type	Description	Default
`n_units`	`int`	Total number of geographic units (clusters).	required
`n_periods`	`int`	Number of time periods observed.	required
`icc`	`float`	Intra-cluster correlation coefficient. Defaults to `0.05`.	`0.05`
`alpha`	`float`	Significance level. Defaults to `0.05`.	`0.05`
`power`	`float`	Statistical power (1 - beta). Defaults to `0.80`.	`0.8`
`proportion_treated`	`float`	Fraction of units assigned to treatment. Defaults to `0.5`.	`0.5`
`outcome_variance`	`float`	Variance of the outcome variable. Defaults to `1.0`.	`1.0`
`r_squared`	`float`	Proportion of variance explained by covariates. Defaults to `0.0` (no covariates).	`0.0`

Returns:

Type	Description
`PowerResult`	A `PowerResult` with the computed MDE and all design parameters.

Raises:

Type	Description
`ImportError`	If scipy is not installed. Install with `pip install nyc311[stats]`.
`ValueError`	If any parameter is out of its valid range.

Source code in src/nyc311/stats/_power.py

def minimum_detectable_effect(
    n_units: int,
    n_periods: int,
    *,
    icc: float = 0.05,
    alpha: float = 0.05,
    power: float = 0.80,
    proportion_treated: float = 0.5,
    outcome_variance: float = 1.0,
    r_squared: float = 0.0,
) -> PowerResult:
    """Compute the minimum detectable effect for a panel experiment.

    Uses the standard cluster-RCT MDE formula:

        MDE = (z_{alpha/2} + z_{beta}) * sqrt(2 * sigma^2 * DE / (N_t * T))

    where DE = 1 + (T - 1) * ICC is the design effect,
    ``N_t = n_units * proportion_treated`` is the number of treated units,
    ``T = n_periods`` and ``sigma^2 = outcome_variance * (1 - r_squared)``
    is the residual outcome variance after covariate adjustment.

    Args:
        n_units: Total number of geographic units (clusters).  Must be
            at least ``2``.
        n_periods: Number of time periods observed.  Must be at least
            ``1``.
        icc: Intra-cluster correlation coefficient in ``[0, 1]``.
            Defaults to ``0.05``.
        alpha: Two-sided significance level in ``(0, 1)``.  Defaults to
            ``0.05``.
        power: Statistical power (1 - beta) in ``(0, 1)``.  Defaults to
            ``0.80``.
        proportion_treated: Fraction of units assigned to treatment, in
            ``(0, 1)``.  Defaults to ``0.5``.
        outcome_variance: Variance of the outcome variable; must be
            non-negative.  Defaults to ``1.0``.
        r_squared: Proportion of variance explained by covariates, in
            ``[0, 1]``.  Defaults to ``0.0`` (no covariates).

    Returns:
        A :class:`PowerResult` with the computed MDE and all design
        parameters.

    Raises:
        ImportError: If scipy is not installed.  Install with
            ``pip install nyc311[stats]``.
        ValueError: If any parameter is out of its valid range.
    """
    try:
        from scipy.stats import norm
    except ImportError as exc:
        msg = "scipy is required for minimum_detectable_effect(). Install with: pip install nyc311[stats]"
        raise ImportError(msg) from exc

    if n_units < 2:
        msg = "n_units must be at least 2."
        raise ValueError(msg)
    if n_periods < 1:
        msg = "n_periods must be at least 1."
        raise ValueError(msg)
    if not 0.0 < proportion_treated < 1.0:
        msg = "proportion_treated must be in (0, 1)."
        raise ValueError(msg)
    # The ``not a < x < b`` form also rejects NaN, which would otherwise
    # propagate silently into the result.
    if not 0.0 < alpha < 1.0:
        msg = "alpha must be in (0, 1)."
        raise ValueError(msg)
    if not 0.0 < power < 1.0:
        msg = "power must be in (0, 1)."
        raise ValueError(msg)
    if not 0.0 <= icc <= 1.0:
        msg = "icc must be in [0, 1]."
        raise ValueError(msg)
    if not outcome_variance >= 0.0:
        msg = "outcome_variance must be non-negative."
        raise ValueError(msg)
    # r_squared > 1 would make the adjusted variance negative and turn the
    # square root below into a complex number.
    if not 0.0 <= r_squared <= 1.0:
        msg = "r_squared must be in [0, 1]."
        raise ValueError(msg)

    z_alpha = float(norm.ppf(1.0 - alpha / 2.0))
    z_beta = float(norm.ppf(power))

    design_effect = 1.0 + (n_periods - 1) * icc
    n_treated = n_units * proportion_treated
    adjusted_var = outcome_variance * (1.0 - r_squared)

    mde = (z_alpha + z_beta) * (
        (2.0 * adjusted_var * design_effect / (n_treated * n_periods)) ** 0.5
    )

    return PowerResult(
        mde=float(mde),
        alpha=alpha,
        power=power,
        n_units=n_units,
        n_periods=n_periods,
        icc=icc,
        variance_explained=r_squared,
    )

regression_discontinuity ¶

regression_discontinuity(
    running_variable: Any,
    outcome: Any,
    cutoff: float = 0.0,
    *,
    kernel: str = "triangular",
    bandwidth: float | None = None,
    polynomial_order: int = 1,
) -> RDResult

Estimate a local treatment effect at a sharp cutoff.

Fits local polynomials on each side of the cutoff, using the Imbens-Kalyanaraman (IK) or Calonico-Cattaneo-Titiunik (CCT) bandwidth selector when bandwidth is None.

Parameters:

Name	Type	Description	Default
`running_variable`	`Any`	Array-like running (assignment) variable.	required
`outcome`	`Any`	Array-like outcome variable of the same length.	required
`cutoff`	`float`	The threshold value of the running variable. Defaults to `0.0`.	`0.0`
`kernel`	`str`	Kernel for local weighting. One of `"triangular"` (default), `"epanechnikov"`, or `"uniform"`.	`'triangular'`
`bandwidth`	`float \| None`	Bandwidth for the local polynomial fit. When `None`, an optimal bandwidth is selected automatically.	`None`
`polynomial_order`	`int`	Degree of the local polynomial. Defaults to `1` (local linear).	`1`

Returns:

Type	Description
`RDResult`	An `RDResult` with the treatment effect estimate, robust standard error, bias-corrected confidence interval, effective sample sizes, and bandwidth.

Raises:

Type	Description
`ImportError`	If numpy or scipy is not installed.
`ValueError`	If arrays are mismatched or too few observations exist on either side.

Source code in src/nyc311/stats/_rdd.py

def regression_discontinuity(
    running_variable: Any,
    outcome: Any,
    cutoff: float = 0.0,
    *,
    kernel: str = "triangular",
    bandwidth: float | None = None,
    polynomial_order: int = 1,
) -> RDResult:
    """Estimate a local treatment effect at a sharp cutoff.

    Fits a kernel-weighted polynomial separately on each side of the
    cutoff (observations with ``running_variable >= cutoff`` are on the
    right) and reports the difference between the two intercepts, right
    minus left.  The same bandwidth is used on both sides.  When
    ``bandwidth`` is ``None`` it is obtained from the module's
    ``_ik_bandwidth`` helper.

    Args:
        running_variable: Array-like running (assignment) variable.
        outcome: Array-like outcome variable of the same length.
        cutoff: The threshold value of the running variable.
            Defaults to ``0.0``.
        kernel: Kernel for local weighting. One of ``"triangular"``
            (default), ``"epanechnikov"``, or ``"uniform"``.
        bandwidth: Bandwidth for the local polynomial fit; must be
            positive. When ``None``, a bandwidth is selected
            automatically.
        polynomial_order: Degree of the local polynomial (non-negative).
            Defaults to ``1`` (local linear).

    Returns:
        An :class:`RDResult` with the treatment effect estimate, a
        sandwich (heteroskedasticity-robust) standard error, a
        normal-approximation confidence interval (estimate +/- 1.96
        standard errors), effective sample sizes, and bandwidth.

    Raises:
        ImportError: If numpy or scipy is not installed.
        ValueError: If ``kernel``, ``polynomial_order`` or ``bandwidth``
            is invalid, if arrays are mismatched, or if too few
            observations exist on either side.
    """
    try:
        import numpy as np
        from scipy.stats import norm
    except ImportError as exc:
        msg = (
            "numpy and scipy are required for regression_discontinuity(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(msg) from exc

    # Reject unknown kernels explicitly instead of silently falling back
    # to uniform weights.
    if kernel not in ("triangular", "epanechnikov", "uniform"):
        msg = 'kernel must be one of "triangular", "epanechnikov", or "uniform".'
        raise ValueError(msg)
    if polynomial_order < 0:
        msg = "polynomial_order must be non-negative."
        raise ValueError(msg)

    x = np.asarray(running_variable, dtype=float)
    y = np.asarray(outcome, dtype=float)

    if len(x) != len(y):
        msg = "running_variable and outcome must have the same length."
        raise ValueError(msg)

    x_centered = x - cutoff
    left_mask = x_centered < 0
    right_mask = x_centered >= 0

    if left_mask.sum() < 3 or right_mask.sum() < 3:
        msg = "Need at least 3 observations on each side of the cutoff."
        raise ValueError(msg)

    if bandwidth is None:
        bandwidth = _ik_bandwidth(x_centered, y)

    # ``not bandwidth > 0`` also catches NaN.
    if not bandwidth > 0:
        msg = "bandwidth must be positive."
        raise ValueError(msg)

    bw_left = bandwidth
    bw_right = bandwidth

    left_bw_mask = left_mask & (x_centered >= -bw_left)
    right_bw_mask = right_mask & (x_centered <= bw_right)

    n_left = int(left_bw_mask.sum())
    n_right = int(right_bw_mask.sum())

    # A polynomial of degree p has p + 1 coefficients, so fewer
    # observations than that cannot identify the fit.
    min_obs = max(2, polynomial_order + 1)
    if n_left < min_obs or n_right < min_obs:
        msg = "Too few observations within bandwidth."
        raise ValueError(msg)

    def _kernel_weights(u: Any) -> Any:
        """Return kernel weights for bandwidth-scaled distances ``u``."""
        u_abs = np.abs(u)
        if kernel == "triangular":
            return np.maximum(1.0 - u_abs, 0.0)
        if kernel == "epanechnikov":
            return np.maximum(0.75 * (1.0 - u_abs**2), 0.0)
        # Only "uniform" remains after the validation above.
        return np.ones_like(u_abs)

    x_left = x_centered[left_bw_mask]
    y_left = y[left_bw_mask]
    w_left = _kernel_weights(x_left / bw_left)

    x_right = x_centered[right_bw_mask]
    y_right = y[right_bw_mask]
    w_right = _kernel_weights(x_right / bw_right)

    def _wls_fit(xv: Any, yv: Any, wv: Any, order: int) -> tuple[Any, Any]:
        """Weighted least squares with a sandwich covariance estimate."""
        design = np.column_stack([xv**p for p in range(order + 1)])
        # Row-scaling by the weights equals multiplying by diag(w) without
        # materialising an n-by-n matrix.
        xtw = (design * wv[:, None]).T
        xtwx = xtw @ design
        beta = np.linalg.solve(xtwx, xtw @ yv)
        resid = yv - design @ beta
        bread = np.linalg.inv(xtwx)
        meat = (design * ((wv * resid) ** 2)[:, None]).T @ design
        vcov = bread @ meat @ bread
        return beta, vcov

    beta_left, vcov_left = _wls_fit(x_left, y_left, w_left, polynomial_order)
    beta_right, vcov_right = _wls_fit(x_right, y_right, w_right, polynomial_order)

    # The intercepts are the fitted values at the cutoff because the
    # running variable was centred on it.
    tau = float(beta_right[0] - beta_left[0])
    se = float(np.sqrt(vcov_left[0, 0] + vcov_right[0, 0]))
    # Floor the standard error so the z-statistic never divides by zero.
    se = max(se, 1e-10)

    z = tau / se
    # The survival function keeps precision in the far tail, where
    # ``1 - cdf`` rounds to zero.
    p_value = float(2.0 * norm.sf(abs(z)))
    ci_lo = tau - 1.96 * se
    ci_hi = tau + 1.96 * se

    summary = (
        f"RD Estimate: {tau:.4f} (SE={se:.4f}, p={p_value:.4f})\n"
        f"Bandwidth: [{bw_left:.4f}, {bw_right:.4f}]\n"
        f"Effective N: {n_left} (left), {n_right} (right)\n"
        f"Kernel: {kernel}, Polynomial order: {polynomial_order}"
    )

    return RDResult(
        treatment_effect=tau,
        se_robust=se,
        p_value=p_value,
        ci_lower=ci_lo,
        ci_upper=ci_hi,
        bandwidth_left=bw_left,
        bandwidth_right=bw_right,
        n_effective_left=n_left,
        n_effective_right=n_right,
        kernel=kernel,
        model_summary=summary,
    )

latent_reporting_bias_em ¶

latent_reporting_bias_em(
    complaint_counts: dict[str, int],
    populations: dict[str, int],
    covariates: dict[str, dict[str, float]] | None = None,
    *,
    max_iter: int = 200,
    tol: float = 1e-06,
) -> LatentReportingResult

Estimate true complaint rates via expectation-maximization.

Models observed counts as a product of a latent true rate and a reporting probability. The EM algorithm alternates between two updates: the true rates (Poisson MLE given the current reporting probabilities) and the reporting probabilities (logistic model on covariates given the current true rates).

Parameters:

Name	Type	Description	Default
`complaint_counts`	`dict[str, int]`	Mapping `{unit_id: observed_count}`.	required
`populations`	`dict[str, int]`	Mapping `{unit_id: population}`.	required
`covariates`	`dict[str, dict[str, float]] \| None`	Optional mapping `{unit_id: {covariate_name: value}}`. When `None`, a uniform reporting probability is assumed.	`None`
`max_iter`	`int`	Maximum EM iterations.	`200`
`tol`	`float`	Convergence tolerance on log-likelihood change.	`1e-06`

Returns:

Type	Description
`LatentReportingResult`	A `LatentReportingResult` with estimated true rates, reporting probabilities, and convergence diagnostics.

Raises:

Type	Description
`ImportError`	If numpy or scipy is not installed.

Source code in src/nyc311/stats/_reporting_bias.py

def latent_reporting_bias_em(
    complaint_counts: dict[str, int],
    populations: dict[str, int],
    covariates: dict[str, dict[str, float]] | None = None,
    *,
    max_iter: int = 200,
    tol: float = 1e-6,
) -> LatentReportingResult:
    """Estimate true complaint rates via expectation-maximization.

    Models observed counts as a product of a latent true rate and a
    reporting probability.  Each iteration performs two updates:

    1. True rates: observed counts are divided by the current reporting
       probabilities to give expected true counts, then by population.
    2. Reporting probabilities: with covariates, a logistic model is
       refitted with up to five Newton steps; without covariates, the
       ratio of observed to expected counts is clipped to
       ``[0.01, 0.99]``.

    Iteration stops when the Poisson log-likelihood changes by less than
    ``tol`` or after ``max_iter`` iterations.

    Args:
        complaint_counts: Mapping ``{unit_id: observed_count}``.
        populations: Mapping ``{unit_id: population}``.  Must contain
            every key of ``complaint_counts``.
        covariates: Optional mapping
            ``{unit_id: {covariate_name: value}}``.  When ``None``,
            a uniform reporting probability is assumed.  Covariate names
            are taken from the first entry of the mapping.
        max_iter: Maximum EM iterations.
        tol: Convergence tolerance on log-likelihood change.

    Returns:
        A :class:`LatentReportingResult` with estimated true rates,
        reporting probabilities, and convergence diagnostics.

    Raises:
        ImportError: If numpy or scipy is not installed.
        ValueError: If ``covariates`` is given but empty.
        KeyError: If ``populations`` or ``covariates`` lacks a unit
            present in ``complaint_counts``.
    """
    try:
        import numpy as np
        from scipy.special import expit
    except ImportError as exc:
        msg = (
            "numpy and scipy are required for latent_reporting_bias_em(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(msg) from exc

    # An empty mapping would otherwise leak a bare StopIteration from the
    # ``next(iter(...))`` lookup below.
    if covariates is not None and not covariates:
        msg = "covariates must not be empty; pass None to omit covariates."
        raise ValueError(msg)

    unit_ids = sorted(complaint_counts)
    n = len(unit_ids)

    y = np.array([complaint_counts[uid] for uid in unit_ids], dtype=float)
    pop = np.array([populations[uid] for uid in unit_ids], dtype=float)

    # Populations below 1 are floored at 1 to avoid division by zero.
    observed_rates = y / np.maximum(pop, 1.0)

    lambda_hat = observed_rates.copy() + 1e-8
    rho_hat = np.full(n, 0.5)

    if covariates is not None:
        cov_names = sorted(next(iter(covariates.values())).keys())
        x = np.column_stack(
            [
                np.array([covariates[uid][c] for uid in unit_ids], dtype=float)
                for c in cov_names
            ]
        )
        # Prepend an intercept column.
        x = np.column_stack([np.ones(n), x])
        beta = np.zeros(x.shape[1])
    else:
        x = None
        beta = None

    ll_trace: list[float] = []
    converged = False

    for _iteration in range(max_iter):
        # Update the true rates given the current reporting probabilities.
        expected_true = y / np.maximum(rho_hat, 1e-10)
        lambda_hat = expected_true / np.maximum(pop, 1.0)
        lambda_hat = np.maximum(lambda_hat, 1e-10)

        if x is not None and beta is not None:
            # A few Newton steps on the logistic reporting model.
            for _ in range(5):
                rho_pred = expit(x @ beta)
                residual = (y / np.maximum(lambda_hat * pop, 1e-10)) - rho_pred
                grad = x.T @ residual
                # Row-scaling replaces diag(w) @ x without allocating an
                # n-by-n matrix on every step.
                curvature = rho_pred * (1 - rho_pred)
                hess = -(x.T @ (curvature[:, None] * x))
                try:
                    step = np.linalg.solve(hess, grad)
                    beta = beta - step
                except np.linalg.LinAlgError:
                    # Singular Hessian: keep the current coefficients.
                    break
            rho_hat = expit(x @ beta)
        else:
            # NOTE(review): for units with positive counts and population
            # >= 1 this reproduces the previous rho_hat, since lambda_hat
            # was just derived from it -- confirm this is intended.
            rho_hat = np.clip(y / np.maximum(lambda_hat * pop, 1e-10), 0.01, 0.99)

        # Poisson log-likelihood up to an additive constant.
        ll = float(
            np.sum(
                y * np.log(np.maximum(lambda_hat * pop * rho_hat, 1e-10))
                - lambda_hat * pop * rho_hat
            )
        )
        ll_trace.append(ll)

        if len(ll_trace) > 1 and abs(ll_trace[-1] - ll_trace[-2]) < tol:
            converged = True
            break

    return LatentReportingResult(
        estimated_true_rates={
            uid: float(lambda_hat[i]) for i, uid in enumerate(unit_ids)
        },
        reporting_probabilities={
            uid: float(rho_hat[i]) for i, uid in enumerate(unit_ids)
        },
        observed_rates={
            uid: float(observed_rates[i]) for i, uid in enumerate(unit_ids)
        },
        n_iterations=len(ll_trace),
        converged=converged,
        log_likelihood_trace=tuple(ll_trace),
    )

reporting_rate_adjustment ¶

reporting_rate_adjustment(
    panel: PanelDataset,
    outcome: str,
    demographic_covariates: tuple[str, ...],
) -> ReportingAdjustmentResult

Adjust complaint rates for neighborhood reporting propensity.

Fits a mixed-effects model with unit random intercepts:

outcome ~ covariates + (1 | unit_id)

The random intercepts capture unit-level reporting propensity after controlling for demographic covariates.

Parameters:

Name	Type	Description	Default
`panel`	`PanelDataset`	A `PanelDataset` with covariates attached.	required
`outcome`	`str`	Column name for the complaint rate to adjust.	required
`demographic_covariates`	`tuple[str, ...]`	Column names for demographic controls (e.g. median income, population density).	required

Returns:

Type	Description
`ReportingAdjustmentResult`	A `ReportingAdjustmentResult` with raw and adjusted rates, random intercepts, ICC, and model summary.

Raises:

Type	Description
`ImportError`	If statsmodels or pandas is not installed.

Source code in src/nyc311/stats/_reporting_bias.py

def reporting_rate_adjustment(
    panel: PanelDataset,
    outcome: str,
    demographic_covariates: tuple[str, ...],
) -> ReportingAdjustmentResult:
    """Adjust complaint rates for neighborhood reporting propensity.

    Fits a REML mixed-effects model with a random intercept per unit:

        outcome ~ covariates + (1 | unit_id)

    Each unit's estimated random intercept is treated as its reporting
    propensity and subtracted from the unit's mean observed outcome.

    Args:
        panel: A :class:`PanelDataset` with covariates attached.
        outcome: Column name for the complaint rate to adjust.
        demographic_covariates: Column names for demographic controls
            (e.g. median income, population density).

    Returns:
        A :class:`ReportingAdjustmentResult` holding the raw per-unit
        mean rates, the adjusted rates, the random intercepts used as
        adjustment factors, the ICC, and the model summary text.

    Raises:
        ImportError: If statsmodels or pandas is not installed.
    """
    try:
        from statsmodels.regression.mixed_linear_model import MixedLM
    except ImportError as exc:
        msg = (
            "statsmodels and pandas are required for "
            "reporting_rate_adjustment(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(msg) from exc

    frame = panel.to_dataframe().reset_index()

    right_hand_side = " + ".join(demographic_covariates)
    formula = f"{outcome} ~ {right_hand_side}"

    mixed_model = MixedLM.from_formula(formula, groups="unit_id", data=frame)
    fitted = mixed_model.fit(reml=True)  # pylint: disable=unexpected-keyword-arg

    unit_ids = sorted(frame["unit_id"].unique())

    # Raw rate = mean of the outcome over all of a unit's rows.
    raw_rates: dict[str, float] = {
        uid: float(frame.loc[frame["unit_id"] == uid, outcome].mean())
        for uid in unit_ids
    }

    # Units without an estimated random effect get a zero adjustment.
    intercepts = fitted.random_effects
    adjustment_factors: dict[str, float] = {
        uid: float(intercepts[uid].iloc[0]) if uid in intercepts else 0.0
        for uid in unit_ids
    }

    # cov_re may be a DataFrame-like object or a plain scalar.
    if hasattr(fitted.cov_re, "iloc"):
        group_var = float(fitted.cov_re.iloc[0, 0])
    else:
        group_var = float(fitted.cov_re)
    resid_var = float(fitted.scale)

    # ICC = between-unit variance share; defined as 0 when there is no
    # variance at all.
    if (group_var + resid_var) > 0:
        icc = group_var / (group_var + resid_var)
    else:
        icc = 0.0

    adjusted_rates: dict[str, float] = {
        uid: raw_rates[uid] - adjustment_factors[uid] for uid in unit_ids
    }

    return ReportingAdjustmentResult(
        raw_rates=raw_rates,
        adjusted_rates=adjusted_rates,
        adjustment_factors=adjustment_factors,
        covariates_used=demographic_covariates,
        icc=icc,
        model_summary=str(fitted.summary()),
    )

global_morans_i ¶

global_morans_i(
    values: dict[str, float],
    weights: dict[str, dict[str, float]],
) -> MoranResult

Compute Global Moran's I for values under spatial weights.

Parameters:

Name	Type	Description	Default
`values`	`dict[str, float]`	Mapping `{unit_id: numeric_value}` to test for spatial autocorrelation. Unit IDs must align with those in `weights`.	required
`weights`	`dict[str, dict[str, float]]`	Nested dict `{unit_a: {unit_b: weight}}` describing the spatial weights matrix; typically row-standardized.	required

Returns:

Type	Description
`MoranResult`	A `MoranResult` with the Moran's I statistic, the permutation-based p-value, the standardized z-score, and the expected value under the null hypothesis.

Raises:

Type	Description
`ImportError`	If `esda` or `libpysal` is not installed. Install the optional stats extra with `pip install nyc311[stats]`.

Source code in src/nyc311/stats/_spatial.py

def global_morans_i(
    values: dict[str, float],
    weights: dict[str, dict[str, float]],
) -> MoranResult:
    """Compute Global Moran's I for ``values`` under spatial ``weights``.

    Units are processed in sorted ``unit_id`` order.  A unit missing from
    ``weights`` is given an empty neighbor list.

    Args:
        values: Mapping ``{unit_id: numeric_value}`` to test for spatial
            autocorrelation. Unit IDs must align with those in
            ``weights``.
        weights: Nested dict ``{unit_a: {unit_b: weight}}`` describing
            the spatial weights matrix; typically row-standardized.

    Returns:
        A :class:`MoranResult` carrying the statistic (``I``), the
        simulated p-value (``p_sim``), the simulated z-score (``z_sim``)
        and the expected value under the null (``EI``) as reported by
        ``esda.moran.Moran``.

    Raises:
        ImportError: If ``esda`` or ``libpysal`` is not installed.
            Install the optional stats extra with
            ``pip install nyc311[stats]``.
    """
    try:
        import numpy as np
        from esda.moran import Moran
        from libpysal.weights import W
    except ImportError as exc:
        message = (
            "esda and libpysal are required for spatial autocorrelation. "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(message) from exc

    ordered_ids = sorted(values)
    observations = np.array([values[uid] for uid in ordered_ids])

    # Split each unit's weight row into parallel neighbor/weight lists,
    # the two-dict form that ``W`` is constructed from.
    neighbors: dict[str, list[str]] = {}
    weight_vals: dict[str, list[float]] = {}
    for uid in ordered_ids:
        neighbors[uid] = list(weights.get(uid, {}).keys())
        weight_vals[uid] = list(weights.get(uid, {}).values())

    moran = Moran(observations, W(neighbors, weight_vals))

    return MoranResult(
        statistic=float(moran.I),
        p_value=float(moran.p_sim),
        z_score=float(moran.z_sim),
        expected=float(moran.EI),
    )

local_morans_i ¶

local_morans_i(
    values: dict[str, float],
    weights: dict[str, dict[str, float]],
    *,
    permutations: int = 999,
) -> LISAResult

Compute Local Moran's I (LISA) for hotspot/coldspot identification.

Parameters:

Name	Type	Description	Default
`values`	`dict[str, float]`	Mapping `{unit_id: numeric_value}` for the variable being tested.	required
`weights`	`dict[str, dict[str, float]]`	Nested dict `{unit_a: {unit_b: weight}}` describing the spatial weights matrix.	required
`permutations`	`int`	Number of conditional permutations used to derive pseudo p-values.	`999`

Returns:

Type	Description
`LISAResult`	A `LISAResult` containing the local statistic, pseudo p-values, and quadrant cluster labels (`"HH"`, `"LH"`, `"LL"`, `"HL"`, or `"ns"` for non-significant) per unit.

Raises:

Type	Description
`ImportError`	If `esda` or `libpysal` is not installed. Install the optional stats extra with `pip install nyc311[stats]`.

Source code in src/nyc311/stats/_spatial.py

def local_morans_i(
    values: dict[str, float],
    weights: dict[str, dict[str, float]],
    *,
    permutations: int = 999,
) -> LISAResult:
    """Compute Local Moran's I (LISA) for hotspot/coldspot identification.

    Units are processed in sorted ``unit_id`` order.  A unit receives its
    quadrant label only when its pseudo p-value is below ``0.05``;
    otherwise it is labelled ``"ns"``.

    Args:
        values: Mapping ``{unit_id: numeric_value}`` for the variable
            being tested.
        weights: Nested dict ``{unit_a: {unit_b: weight}}`` describing
            the spatial weights matrix.
        permutations: Number of conditional permutations used to derive
            pseudo p-values.

    Returns:
        A :class:`LISAResult` containing the local statistic, pseudo
        p-values, and quadrant cluster labels (``"HH"``, ``"LH"``,
        ``"LL"``, ``"HL"``, or ``"ns"`` for non-significant) per unit.

    Raises:
        ImportError: If ``esda`` or ``libpysal`` is not installed.
            Install the optional stats extra with
            ``pip install nyc311[stats]``.
    """
    try:
        import numpy as np
        from esda.moran import Moran_Local
        from libpysal.weights import W
    except ImportError as exc:
        message = (
            "esda and libpysal are required for LISA analysis. "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(message) from exc

    ordered_ids = sorted(values)
    observations = np.array([values[uid] for uid in ordered_ids])

    # Split each unit's weight row into parallel neighbor/weight lists.
    neighbors: dict[str, list[str]] = {}
    weight_vals: dict[str, list[float]] = {}
    for uid in ordered_ids:
        neighbors[uid] = list(weights.get(uid, {}).keys())
        weight_vals[uid] = list(weights.get(uid, {}).values())

    local = Moran_Local(
        observations,
        W(neighbors, weight_vals),
        permutations=permutations,
    )

    # Quadrant codes without a known label also map to "ns".
    cluster_labels = tuple(
        _LISA_QUAD_LABELS.get(int(quadrant), "ns") if local.p_sim[pos] < 0.05 else "ns"
        for pos, quadrant in enumerate(local.q)
    )

    return LISAResult(
        local_statistic=tuple(float(stat) for stat in local.Is),
        p_values=tuple(float(p) for p in local.p_sim),
        cluster_labels=cluster_labels,
        unit_ids=tuple(ordered_ids),
    )

spatial_error_model ¶

spatial_error_model(
    panel: PanelDataset,
    weights: dict[str, dict[str, float]],
    outcome: str,
    regressors: tuple[str, ...],
    *,
    period: str | None = None,
) -> SpatialErrorResult

Fit a spatial error (SEM) model via maximum likelihood.

Estimates: y = X @ beta + u, u = lambda * W @ u + epsilon

Parameters:

Name	Type	Description	Default
`panel`	`PanelDataset`	A `PanelDataset` containing the outcome and regressor columns.	required
`weights`	`dict[str, dict[str, float]]`	Nested dict `{unit_a: {unit_b: weight}}` of spatial weights (row-standardized).	required
`outcome`	`str`	Column name for the dependent variable.	required
`regressors`	`tuple[str, ...]`	Column names for the independent variables.	required
`period`	`str \| None`	If given, extract only this period as a cross-section. If `None`, collapse across periods via group means.	`None`

Returns:

Type	Description
`SpatialErrorResult`	A `SpatialErrorResult` with estimated coefficients, the spatial error parameter (lambda), and fit statistics.

Raises:

Type	Description
`ImportError`	If spreg or libpysal is not installed.

Source code in src/nyc311/stats/_spatial_regression.py

def spatial_error_model(
    panel: PanelDataset,
    weights: dict[str, dict[str, float]],
    outcome: str,
    regressors: tuple[str, ...],
    *,
    period: str | None = None,
) -> SpatialErrorResult:
    """Fit a spatial error (SEM) model via maximum likelihood.

    Estimates: y = X @ beta + u,  u = lambda * W @ u + epsilon

    The panel is first reduced to one row per unit by
    ``_extract_cross_section``; weight rows are looked up by the string
    form of each unit ID, and a unit without a row gets no neighbors.

    Args:
        panel: A :class:`PanelDataset` containing the outcome and
            regressor columns.
        weights: Nested dict ``{unit_a: {unit_b: weight}}`` of spatial
            weights (row-standardized).
        outcome: Column name for the dependent variable.
        regressors: Column names for the independent variables.
        period: If given, extract only this period as a cross-section.
            If ``None``, collapse across periods via group means.

    Returns:
        A :class:`SpatialErrorResult` with estimated coefficients
        (keyed ``"CONSTANT"`` plus the regressor names), the spatial
        error parameter (lambda), and fit statistics.

    Raises:
        ImportError: If spreg or libpysal is not installed.
    """
    try:
        import numpy as np
        from libpysal.weights import W
        from spreg import ML_Error
    except ImportError as exc:
        msg = (
            "spreg and libpysal are required for spatial_error_model(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(msg) from exc

    cross_section = _extract_cross_section(panel, outcome, regressors, period)
    unit_ids = list(cross_section.index)

    # Split each unit's weight row into parallel neighbor/weight lists.
    neighbors = {}
    weight_vals = {}
    for uid in unit_ids:
        neighbors[uid] = list(weights.get(str(uid), {}).keys())
        weight_vals[uid] = list(weights.get(str(uid), {}).values())
    spatial_w = W(neighbors, weight_vals)

    # Outcome is passed as a column vector; regressors as a 2-D matrix.
    dependent = np.asarray(cross_section[outcome].values, dtype=float).reshape(-1, 1)
    columns = [np.asarray(cross_section[name].values, dtype=float) for name in regressors]
    design = np.column_stack(columns)

    model = ML_Error(dependent, design, spatial_w, name_y=outcome, name_x=list(regressors))

    # The first len(var_names) entries of the model outputs are read as
    # the regression coefficients; the entry after them is lambda.
    var_names = ["CONSTANT", *regressors]
    lambda_pos = len(var_names)

    coefficients = {}
    std_errors = {}
    p_values = {}
    for pos, label in enumerate(var_names):
        coefficients[label] = float(model.betas[pos][0])
        std_errors[label] = float(model.std_err[pos])  # pylint: disable=no-member
        p_values[label] = float(model.z_stat[pos][1])  # pylint: disable=no-member

    return SpatialErrorResult(
        coefficients=coefficients,
        std_errors=std_errors,
        p_values=p_values,
        lam=float(model.betas[lambda_pos][0]),
        lam_p_value=float(model.z_stat[lambda_pos][1]),  # pylint: disable=no-member
        log_likelihood=float(model.logll),
        aic=float(model.aic),
        n_observations=int(model.n),
        model_summary=str(model.summary),  # pylint: disable=no-member
    )

spatial_lag_model ¶

spatial_lag_model(
    panel: PanelDataset,
    weights: dict[str, dict[str, float]],
    outcome: str,
    regressors: tuple[str, ...],
    *,
    period: str | None = None,
) -> SpatialLagResult

Fit a spatial lag (SAR) model via maximum likelihood.

Estimates: y = rho * W @ y + X @ beta + epsilon

Parameters:

Name	Type	Description	Default
`panel`	`PanelDataset`	A `PanelDataset` containing the outcome and regressor columns.	required
`weights`	`dict[str, dict[str, float]]`	Nested dict `{unit_a: {unit_b: weight}}` of spatial weights (row-standardized).	required
`outcome`	`str`	Column name for the dependent variable.	required
`regressors`	`tuple[str, ...]`	Column names for the independent variables.	required
`period`	`str \| None`	If given, extract only this period as a cross-section. If `None`, collapse across periods via group means.	`None`

Returns:

Type	Description
`SpatialLagResult`	A `SpatialLagResult` with estimated coefficients, the spatial autoregressive parameter (rho), and fit statistics.

Raises:

Type	Description
`ImportError`	If spreg or libpysal is not installed.

Source code in src/nyc311/stats/_spatial_regression.py

def spatial_lag_model(
    panel: PanelDataset,
    weights: dict[str, dict[str, float]],
    outcome: str,
    regressors: tuple[str, ...],
    *,
    period: str | None = None,
) -> SpatialLagResult:
    """Fit a spatial lag (SAR) model via maximum likelihood.

    Estimates: y = rho * W @ y + X @ beta + epsilon

    The panel is first reduced to one row per unit by
    ``_extract_cross_section``; weight rows are looked up by the string
    form of each unit ID, and a unit without a row gets no neighbors.

    Args:
        panel: A :class:`PanelDataset` containing the outcome and
            regressor columns.
        weights: Nested dict ``{unit_a: {unit_b: weight}}`` of spatial
            weights (row-standardized).
        outcome: Column name for the dependent variable.
        regressors: Column names for the independent variables.
        period: If given, extract only this period as a cross-section.
            If ``None``, collapse across periods via group means.

    Returns:
        A :class:`SpatialLagResult` with estimated coefficients
        (keyed ``"CONSTANT"`` plus the regressor names), the spatial
        autoregressive parameter (rho), and fit statistics.

    Raises:
        ImportError: If spreg or libpysal is not installed.
    """
    try:
        import numpy as np
        from libpysal.weights import W
        from spreg import ML_Lag
    except ImportError as exc:
        msg = (
            "spreg and libpysal are required for spatial_lag_model(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(msg) from exc

    cross_section = _extract_cross_section(panel, outcome, regressors, period)
    unit_ids = list(cross_section.index)

    # Split each unit's weight row into parallel neighbor/weight lists.
    neighbors = {}
    weight_vals = {}
    for uid in unit_ids:
        neighbors[uid] = list(weights.get(str(uid), {}).keys())
        weight_vals[uid] = list(weights.get(str(uid), {}).values())
    spatial_w = W(neighbors, weight_vals)

    # Outcome is passed as a column vector; regressors as a 2-D matrix.
    dependent = np.asarray(cross_section[outcome].values, dtype=float).reshape(-1, 1)
    columns = [np.asarray(cross_section[name].values, dtype=float) for name in regressors]
    design = np.column_stack(columns)

    model = ML_Lag(dependent, design, spatial_w, name_y=outcome, name_x=list(regressors))

    # The first len(var_names) entries of the model outputs are read as
    # the regression coefficients; the entry after them is rho.
    var_names = ["CONSTANT", *regressors]
    rho_pos = len(var_names)

    coefficients = {}
    std_errors = {}
    p_values = {}
    for pos, label in enumerate(var_names):
        coefficients[label] = float(model.betas[pos][0])
        std_errors[label] = float(model.std_err[pos])  # pylint: disable=no-member
        p_values[label] = float(model.z_stat[pos][1])  # pylint: disable=no-member

    return SpatialLagResult(
        coefficients=coefficients,
        std_errors=std_errors,
        p_values=p_values,
        rho=float(model.betas[rho_pos][0]),
        rho_p_value=float(model.z_stat[rho_pos][1]),  # pylint: disable=no-member
        log_likelihood=float(model.logll),
        aic=float(model.aic),
        n_observations=int(model.n),
        model_summary=str(model.summary),  # pylint: disable=no-member
    )

event_study ¶

event_study(
    panel: PanelDataset,
    outcome: str,
    *,
    covariates: tuple[str, ...] = (),
    pre_periods: int = 5,
    post_periods: int = 5,
    reference_period: int = -1,
) -> EventStudyResult

Estimate event-study coefficients with pre-trend diagnostics.

Computes mean differences between treated and control units at each relative time period, with reference_period normalized to zero.

Parameters:

Name	Type	Description	Default
`panel`	`PanelDataset`	A `PanelDataset` with `treatment_events`.	required
`outcome`	`str`	Column name for the outcome variable.	required
`covariates`	`tuple[str, ...]`	Additional control variable column names.	`()`
`pre_periods`	`int`	Number of pre-treatment periods to include.	`5`
`post_periods`	`int`	Number of post-treatment periods to include.	`5`
`reference_period`	`int`	Relative period to normalize to zero. Defaults to `-1` (one period before treatment).	`-1`

Returns:

Type	Description
`EventStudyResult`	An `EventStudyResult` with coefficients per relative period, confidence intervals, and a pre-trend F-test.

Raises:

Type	Description
`ImportError`	If required packages are not installed.

Source code in src/nyc311/stats/_staggered_did.py

def event_study(
    panel: PanelDataset,
    outcome: str,
    *,
    covariates: tuple[str, ...] = (),
    pre_periods: int = 5,
    post_periods: int = 5,
    reference_period: int = -1,
) -> EventStudyResult:
    """Estimate event-study coefficients with pre-trend diagnostics.

    For every relative period in ``[-pre_periods, post_periods]`` the
    coefficient is the average, over treated units, of the unit's mean
    outcome minus the mean outcome of never-treated units in the same
    calendar period. Coefficients are then shifted so that
    ``reference_period`` is zero.

    Args:
        panel: A :class:`PanelDataset` with ``treatment_events``.
        outcome: Column name for the outcome variable.
        covariates: Additional control variable column names. Currently
            unused; reserved for future covariate adjustment.
        pre_periods: Number of pre-treatment periods to include.
        post_periods: Number of post-treatment periods to include.
        reference_period: Relative period to normalize to zero.
            Defaults to ``-1`` (one period before treatment).

    Returns:
        An :class:`EventStudyResult` with coefficients per relative
        period, confidence intervals, and a pre-trend F-test. The
        pre-trend statistic and p-value are ``None`` when no
        pre-treatment period (other than the reference) has a positive
        standard error.

    Raises:
        ImportError: If required packages are not installed.
    """
    try:
        import numpy as np
        import pandas as pd
        from scipy.stats import f as f_dist
    except ImportError as exc:
        msg = (
            "numpy, pandas, and scipy are required for event_study(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(msg) from exc

    _ = covariates  # reserved for future covariate adjustment

    df = panel.to_dataframe()
    if isinstance(df.index, pd.MultiIndex):
        df = df.reset_index()

    # Earliest treatment label per unit: the first seven characters of the
    # ISO date, compared against the values of the ``period`` column.
    unit_treatment_dates: dict[str, str] = {}
    for te in panel.treatment_events:
        date_str = te.treatment_date.isoformat()[:7]
        for uid in te.treated_units:
            if uid not in unit_treatment_dates or date_str < unit_treatment_dates[uid]:
                unit_treatment_dates[uid] = date_str

    periods = sorted(df["period"].unique())
    period_to_idx = {p: i for i, p in enumerate(periods)}

    control_units = set(df["unit_id"].unique()) - set(unit_treatment_dates)
    control_rows = df[df["unit_id"].isin(control_units)]
    # The control-group mean depends only on the calendar period, so compute
    # it once per period instead of once per (treated unit, period) pair.
    # ``None`` marks a period without any control observation.
    control_means: dict = {}

    rel_range = list(range(-pre_periods, post_periods + 1))
    coeffs: list[float] = []
    ses: list[float] = []

    for rel in rel_range:
        diffs: list[float] = []
        for uid, treat_period in unit_treatment_dates.items():
            if treat_period not in period_to_idx:
                continue
            abs_idx = period_to_idx[treat_period] + rel
            if abs_idx < 0 or abs_idx >= len(periods):
                continue
            target_period = periods[abs_idx]

            if target_period not in control_means:
                ctrl_vals = control_rows[control_rows["period"] == target_period][
                    outcome
                ].to_numpy()
                control_means[target_period] = (
                    np.mean(ctrl_vals) if len(ctrl_vals) > 0 else None
                )
            ctrl_mean = control_means[target_period]
            if ctrl_mean is None:
                continue

            treat_vals = df[(df["unit_id"] == uid) & (df["period"] == target_period)][
                outcome
            ].to_numpy()
            if len(treat_vals) > 0:
                diffs.append(float(np.mean(treat_vals) - ctrl_mean))

        if diffs:
            coeffs.append(float(np.mean(diffs)))
            # A single difference has no sampling variability to estimate;
            # 0.0 is used as the "standard error unavailable" marker.
            ses.append(
                float(np.std(diffs, ddof=1) / np.sqrt(len(diffs)))
                if len(diffs) > 1
                else 0.0
            )
        else:
            # No treated/control pair observed at this relative period.
            coeffs.append(0.0)
            ses.append(0.0)

    # NOTE(review): when ``reference_period`` lies outside ``rel_range`` the
    # earliest relative period is silently used for normalization while the
    # result still reports the requested value -- confirm this is intended.
    ref_idx = rel_range.index(reference_period) if reference_period in rel_range else 0
    ref_coeff = coeffs[ref_idx]
    coeffs = [c - ref_coeff for c in coeffs]

    ci_lower = [c - 1.96 * s for c, s in zip(coeffs, ses, strict=True)]
    ci_upper = [c + 1.96 * s for c, s in zip(coeffs, ses, strict=True)]

    # Only pre-periods with a positive standard error enter the pre-trend
    # test. Periods without data (or with a single treated unit) carry the
    # 0.0 marker; dividing their coefficient by a tiny floor would blow the
    # statistic up and force a spurious rejection.
    pre_indices = [
        i
        for i, r in enumerate(rel_range)
        if r < 0 and r != reference_period and ses[i] > 0
    ]
    pre_f = None
    pre_p = None
    if pre_indices:
        pre_coeffs = np.array([coeffs[i] for i in pre_indices])
        pre_ses = np.array([ses[i] for i in pre_indices])
        f_stat = float(np.mean((pre_coeffs / pre_ses) ** 2))
        k = len(pre_indices)
        pre_f = f_stat
        # Survival function avoids the cancellation in ``1 - cdf`` for
        # large statistics.
        pre_p = float(f_dist.sf(f_stat, k, k))

    return EventStudyResult(
        coefficients=tuple(coeffs),
        std_errors=tuple(ses),
        ci_lower=tuple(ci_lower),
        ci_upper=tuple(ci_upper),
        relative_periods=tuple(rel_range),
        pre_trend_f_statistic=pre_f,
        pre_trend_p_value=pre_p,
        reference_period=reference_period,
    )

staggered_did ¶

staggered_did(
    panel: PanelDataset,
    outcome: str,
    *,
    covariates: tuple[str, ...] = (),
    cluster: str = "entity",
) -> StaggeredDiDResult

Estimate group-time ATTs under staggered treatment adoption.

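Computes, for each treatment cohort and each period, the difference in mean outcomes between that cohort and the never-treated units, then combines the group-time estimates into an inverse-variance weighted aggregate ATT. Because each cohort is compared only with never-treated units, this avoids the well-documented bias of naive TWFE under staggered rollouts.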

Parameters:

Name	Type	Description	Default
`panel`	`PanelDataset`	A `PanelDataset` with `treatment_events` specifying when each unit began treatment.	required
`outcome`	`str`	Column name for the outcome variable.	required
`covariates`	`tuple[str, ...]`	Additional control variable column names.	`()`
`cluster`	`str`	Clustering level for standard errors. One of `"entity"` (default) or `"time"`.	`'entity'`

Returns:

Type	Description
`StaggeredDiDResult`	A `StaggeredDiDResult` with group-time ATTs, aggregated ATT, and confidence intervals.

Raises:

Type	Description
`ImportError`	If required packages are not installed.
`ValueError`	If no treatment events are found.

Source code in src/nyc311/stats/_staggered_did.py

def staggered_did(
    panel: PanelDataset,
    outcome: str,
    *,
    covariates: tuple[str, ...] = (),
    cluster: str = "entity",
) -> StaggeredDiDResult:
    """Estimate group-time ATTs under staggered treatment adoption.

    Units are grouped into cohorts by their earliest treatment label. For
    each cohort and each panel period the estimate is the difference
    between the cohort's mean outcome and the mean outcome of
    never-treated units in that period, with an unequal-variance standard
    error and a normal-approximation p-value. The aggregated ATT is the
    inverse-variance weighted average of the cohort/period cells at or
    after the cohort's treatment period; pre-treatment cells are still
    reported in ``group_time_atts`` but do not enter the aggregate.

    Args:
        panel: A :class:`PanelDataset` with ``treatment_events``
            specifying when each unit began treatment.
        outcome: Column name for the outcome variable.
        covariates: Additional control variable column names. Currently
            unused; reserved for future covariate adjustment.
        cluster: Clustering level for standard errors. One of
            ``"entity"`` (default) or ``"time"``. Currently unused;
            reserved for future clustering support.

    Returns:
        A :class:`StaggeredDiDResult` with group-time ATTs,
        aggregated ATT, and confidence intervals. When no post-treatment
        cell can be estimated the aggregate is reported as ``0.0`` with
        a p-value of ``1.0``.

    Raises:
        ImportError: If required packages are not installed.
        ValueError: If no treatment events are found.
    """
    try:
        import numpy as np
        import pandas as pd
        from scipy.stats import norm
    except ImportError as exc:
        msg = (
            "numpy, pandas, and scipy are required for staggered_did(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(msg) from exc

    _ = covariates, cluster  # reserved for future covariate adjustment and clustering

    if not panel.treatment_events:
        msg = "Panel must have at least one treatment event."
        raise ValueError(msg)

    df = panel.to_dataframe()
    if isinstance(df.index, pd.MultiIndex):
        df = df.reset_index()

    # Earliest treatment label per unit (first seven characters of the ISO
    # date), compared against the values of the ``period`` column.
    unit_treatment_dates: dict[str, str] = {}
    for te in panel.treatment_events:
        date_str = te.treatment_date.isoformat()[:7]
        for uid in te.treated_units:
            if uid not in unit_treatment_dates or date_str < unit_treatment_dates[uid]:
                unit_treatment_dates[uid] = date_str

    # Keep the cohort labels in a standalone Series so the frame handed out
    # by the panel is never modified.
    cohort_labels = df["unit_id"].map(unit_treatment_dates).fillna("never")

    cohorts = sorted(set(unit_treatment_dates.values()))
    periods = sorted(df["period"].unique())
    never_units = df[cohort_labels == "never"]

    gt_atts: list[GroupTimeATT] = []
    # Parallel to ``gt_atts``: True when the cell is at/after treatment.
    post_flags: list[bool] = []
    for cohort in cohorts:
        cohort_units = df[cohort_labels == cohort]

        for period in periods:
            treated_obs = cohort_units[cohort_units["period"] == period]
            control_obs = never_units[never_units["period"] == period]

            if len(treated_obs) == 0 or len(control_obs) == 0:
                continue

            y_t = treated_obs[outcome].to_numpy().astype(float)
            y_c = control_obs[outcome].to_numpy().astype(float)

            att_val = float(np.mean(y_t) - np.mean(y_c))
            var_t = float(np.var(y_t, ddof=1)) if len(y_t) > 1 else 0.0
            var_c = float(np.var(y_c, ddof=1)) if len(y_c) > 1 else 0.0
            se_val = float(np.sqrt(var_t / len(y_t) + var_c / len(y_c)))
            se_val = max(se_val, 1e-10)  # floor keeps the z-ratio finite
            z_val = att_val / se_val
            # Survival function keeps precision for large |z| where
            # ``1 - cdf`` rounds to exactly zero.
            p_val = float(2.0 * norm.sf(abs(z_val)))

            gt_atts.append(
                GroupTimeATT(
                    group=cohort,
                    period=period,
                    att=att_val,
                    se=se_val,
                    p_value=p_val,
                )
            )
            post_flags.append(bool(period >= cohort))

    # Only post-treatment cells measure a treatment effect; pooling the
    # pre-treatment cells would pull the aggregate towards the baseline
    # difference between cohorts and never-treated units.
    post_cells = [
        cell for cell, is_post in zip(gt_atts, post_flags, strict=True) if is_post
    ]
    if post_cells:
        atts = np.array([g.att for g in post_cells])
        ses = np.array([g.se for g in post_cells])
        weights = 1.0 / np.maximum(ses**2, 1e-20)
        agg_att = float(np.average(atts, weights=weights))
        agg_se = float(1.0 / np.sqrt(np.sum(weights)))
        z_agg = agg_att / max(agg_se, 1e-10)
        agg_p = float(2.0 * norm.sf(abs(z_agg)))
        agg_ci_lower = agg_att - 1.96 * agg_se
        agg_ci_upper = agg_att + 1.96 * agg_se
    else:
        agg_att = 0.0
        agg_se = 0.0
        agg_p = 1.0
        agg_ci_lower = 0.0
        agg_ci_upper = 0.0

    summary = (
        f"Staggered DiD: {len(cohorts)} cohort(s), {len(periods)} periods\n"
        f"Group-time ATTs: {len(gt_atts)}\n"
        f"Aggregated ATT: {agg_att:.4f} (SE={agg_se:.4f}, p={agg_p:.4f})"
    )

    return StaggeredDiDResult(
        group_time_atts=tuple(gt_atts),
        aggregated_att=agg_att,
        aggregated_se=agg_se,
        aggregated_p_value=agg_p,
        aggregated_ci_lower=agg_ci_lower,
        aggregated_ci_upper=agg_ci_upper,
        n_groups=len(cohorts),
        n_periods=len(periods),
        model_summary=summary,
    )

synthetic_control ¶

synthetic_control(
    panel: PanelDataset,
    treated_unit: str,
    outcome: str,
    *,
    predictors: tuple[str, ...] = (),
    n_placebo_runs: int = 0,
) -> SyntheticControlResult

Estimate a treatment effect using the synthetic control method.

Constructs a weighted combination of untreated donor units that best reproduces the treated unit's pre-treatment trajectory, then measures the post-treatment divergence as the treatment effect.

Parameters:

Name	Type	Description	Default
`panel`	`PanelDataset`	A `PanelDataset` with treatment information.	required
`treated_unit`	`str`	The unit ID of the treated unit.	required
`outcome`	`str`	Column name for the outcome variable.	required
`predictors`	`tuple[str, ...]`	Additional predictor columns for matching.	`()`
`n_placebo_runs`	`int`	Number of in-space placebos for inference. When `> 0`, each donor unit is iteratively treated and the ratio of post/pre MSPE is used to compute a p-value. Defaults to `0` (no placebos).	`0`

Returns:

Type	Description
`SyntheticControlResult`	A `SyntheticControlResult` with donor weights, counterfactual series, treatment effects, and optionally a placebo p-value.

Raises:

Type	Description
`ImportError`	If numpy, pandas, or scipy is not installed.
`ValueError`	If the treated unit is not found in the panel.

Source code in src/nyc311/stats/_synthetic_control.py

def synthetic_control(
    panel: PanelDataset,
    treated_unit: str,
    outcome: str,
    *,
    predictors: tuple[str, ...] = (),
    n_placebo_runs: int = 0,
) -> SyntheticControlResult:
    """Estimate a treatment effect using the synthetic control method.

    Constructs a weighted combination of donor units that best
    reproduces the treated unit's pre-treatment trajectory, then
    measures the post-treatment divergence as the treatment effect.
    Weights are non-negative, sum to one, and are fitted by least
    squares on the pre-treatment outcomes with SciPy's SLSQP solver.

    Args:
        panel: A :class:`PanelDataset` with treatment information.
        treated_unit: The unit ID of the treated unit.
        outcome: Column name for the outcome variable.
        predictors: Additional predictor columns for matching.
            Currently unused; reserved for future covariate matching.
        n_placebo_runs: Maximum number of in-space placebos for
            inference. When ``> 0``, up to that many donor units are
            treated in turn as if they were the treated unit and the
            ratio of post/pre MSPE is used to compute a p-value of
            ``(k + 1) / (m + 1)``, where ``m`` is the number of placebos
            actually run and ``k`` how many had a ratio at least as
            large as the treated unit's. Defaults to ``0`` (no placebos).

    Returns:
        A :class:`SyntheticControlResult` with donor weights,
        counterfactual series, treatment effects, and optionally a
        placebo p-value. The p-value is ``None`` when no placebos were
        requested or fewer than two donors are available.

    Raises:
        ImportError: If numpy, pandas, or scipy is not installed.
        ValueError: If the treated unit is not found in the panel, has
            no treatment event, or no donor unit has complete
            pre-treatment data.
    """
    try:
        import numpy as np
        import pandas as pd
        from scipy.optimize import minimize
    except ImportError as exc:
        msg = (
            "numpy, pandas, and scipy are required for synthetic_control(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(msg) from exc

    _ = predictors  # reserved for future matching on covariates

    def _fit_weights(target_pre: Any, pool_pre: Any) -> Any:
        """Return simplex-constrained weights minimising the pre-period SSE."""
        n_pool = pool_pre.shape[1]
        fit = minimize(
            lambda w: float(np.sum((target_pre - pool_pre @ w) ** 2)),
            np.ones(n_pool) / n_pool,  # start from equal weights
            method="SLSQP",
            bounds=[(0.0, 1.0)] * n_pool,
            constraints={"type": "eq", "fun": lambda w: np.sum(w) - 1.0},
        )
        return fit.x

    df = panel.to_dataframe()
    if isinstance(df.index, pd.MultiIndex):
        df = df.reset_index()

    if treated_unit not in df["unit_id"].to_numpy():
        msg = f"treated_unit {treated_unit!r} not found in panel."
        raise ValueError(msg)

    # First treatment event (in panel order) that lists the treated unit.
    treatment_event = next(
        (te for te in panel.treatment_events if treated_unit in te.treated_units),
        None,
    )
    if treatment_event is None:
        msg = f"No treatment event found for unit {treated_unit!r}."
        raise ValueError(msg)

    treatment_date_str = treatment_event.treatment_date.isoformat()[:7]
    periods = sorted(df["period"].unique())
    pre_periods = [p for p in periods if p < treatment_date_str]
    post_periods = [p for p in periods if p >= treatment_date_str]

    # NOTE(review): only units belonging to the *same* treatment event are
    # excluded; units treated by other events remain eligible donors --
    # confirm this is intended.
    donor_ids = [
        uid
        for uid in panel.unit_ids
        if uid != treated_unit and uid not in treatment_event.treated_units
    ]

    pivot = df.pivot_table(
        index="period", columns="unit_id", values=outcome, aggfunc="mean"
    )
    pivot = pivot.reindex(periods)

    treated_pre = pivot.loc[pre_periods, treated_unit].to_numpy().astype(float)
    donor_pre = pivot.loc[pre_periods, donor_ids].to_numpy().astype(float)

    # Drop donors with any missing pre-treatment observation.
    valid_donors = ~np.isnan(donor_pre).any(axis=0)
    donor_ids_clean = [uid for uid, ok in zip(donor_ids, valid_donors) if ok]
    donor_pre = donor_pre[:, valid_donors]

    if not donor_ids_clean:
        msg = (
            f"No donor units with complete pre-treatment data for {treated_unit!r}."
        )
        raise ValueError(msg)

    w_star = _fit_weights(treated_pre, donor_pre)

    treated_full = pivot.loc[periods, treated_unit].to_numpy().astype(float)
    donor_full = pivot.loc[periods, donor_ids_clean].to_numpy().astype(float)
    counterfactual = donor_full @ w_star
    effect = treated_full - counterfactual

    pre_mspe = float(np.mean((treated_pre - donor_pre @ w_star) ** 2))
    post_mask = np.array([p in post_periods for p in periods])
    att = float(np.mean(effect[post_mask]))

    # Report only donors with a non-negligible weight.
    donor_weights = {
        uid: float(w_star[i])
        for i, uid in enumerate(donor_ids_clean)
        if w_star[i] > 1e-4
    }

    placebo_p = None
    # A placebo fit needs at least one *other* donor to build from.
    if n_placebo_runs > 0 and len(donor_ids_clean) > 1:
        treated_ratio = _mspe_ratio(effect, pre_periods, post_periods, periods)
        placebo_units = donor_ids_clean[:n_placebo_runs]
        more_extreme = 0
        for placebo_unit in placebo_units:
            other_donors = [d for d in donor_ids_clean if d != placebo_unit]
            placebo_pre = pivot.loc[pre_periods, placebo_unit].to_numpy().astype(float)
            placebo_donor_pre = (
                pivot.loc[pre_periods, other_donors].to_numpy().astype(float)
            )
            placebo_weights = _fit_weights(placebo_pre, placebo_donor_pre)

            placebo_full = pivot.loc[periods, placebo_unit].to_numpy().astype(float)
            placebo_donor_full = (
                pivot.loc[periods, other_donors].to_numpy().astype(float)
            )
            placebo_effect = placebo_full - placebo_donor_full @ placebo_weights

            pr = _mspe_ratio(placebo_effect, pre_periods, post_periods, periods)
            if pr >= treated_ratio:
                more_extreme += 1

        # Divide by the number of placebos actually run: the donor pool may
        # be smaller than ``n_placebo_runs``, and using the requested count
        # would understate the p-value.
        placebo_p = (more_extreme + 1) / (len(placebo_units) + 1)

    summary_lines = [
        f"Synthetic Control: {treated_unit}",
        f"Pre-treatment periods: {len(pre_periods)}",
        f"Post-treatment periods: {len(post_periods)}",
        f"Donors used: {len(donor_weights)}",
        f"Pre-treatment MSPE: {pre_mspe:.6f}",
        f"ATT: {att:.4f}",
    ]
    if placebo_p is not None:
        summary_lines.append(f"Placebo p-value: {placebo_p:.4f}")

    return SyntheticControlResult(
        treated_unit=treated_unit,
        donor_weights=donor_weights,
        counterfactual=tuple(float(c) for c in counterfactual),
        observed=tuple(float(o) for o in treated_full),
        treatment_effect=tuple(float(e) for e in effect),
        att=att,
        periods=tuple(str(p) for p in periods),
        pre_treatment_mspe=pre_mspe,
        placebo_p_value=placebo_p,
        model_summary="\n".join(summary_lines),
    )

CLI¶

nyc311.cli ¶

Command-line entrypoints for nyc311.

main ¶

main(argv: Sequence[str] | None = None) -> int

Run the implemented fetch and complaint-topic export commands.

Source code in src/nyc311/cli/_main.py

def main(argv: Sequence[str] | None = None) -> int:
    """Dispatch the ``topics`` and ``fetch`` subcommands of the nyc311 CLI.

    Args:
        argv: Command-line arguments to parse. ``None`` is forwarded to the
            parser unchanged so it can apply its own default.

    Returns:
        ``0`` once the selected command has finished.

    Raises:
        AssertionError: If the parsed command is neither ``topics`` nor
            ``fetch``.
    """
    cli = build_parser()
    arg_list = None if argv is None else list(argv)
    opts = cli.parse_args(arg_list)
    command = opts.command

    if command == "topics":
        request_filter = build_service_request_filter(opts)
        # GeoJSON output needs boundary geometry to attach the summaries to.
        geojson_without_boundaries = opts.format == "geojson" and not opts.boundaries
        if geojson_without_boundaries:
            cli.error("--boundaries is required when --format geojson is used.")

        run_topic_pipeline(
            opts.source,
            opts.complaint_type,
            geography=opts.geography,
            filters=request_filter,
            top_n=opts.top_n,
            output=Path(opts.output),
            output_format=opts.format,
            boundaries=opts.boundaries,
        )
        return 0

    if command == "fetch":
        request_filter = build_service_request_filter(opts)
        socrata = build_socrata_config(opts)
        destination = Path(opts.output)
        fetch_service_requests(
            filters=request_filter,
            socrata_config=socrata,
            output=destination,
        )
        return 0

    raise AssertionError(f"Unsupported command: {command}")

API Reference¶

Root Package¶

nyc311 ¶

Models¶

nyc311.models ¶

BoundaryCollection module-attribute ¶

BoundaryFeature module-attribute ¶

BOROUGH_BRONX module-attribute ¶

BOROUGH_BROOKLYN module-attribute ¶

BOROUGH_MANHATTAN module-attribute ¶

BOROUGH_QUEENS module-attribute ¶

BOROUGH_STATEN_ISLAND module-attribute ¶

SOCRATA_DATASET_IDENTIFIER module-attribute ¶

SUPPORTED_BOROUGHS module-attribute ¶

SUPPORTED_BOUNDARY_GEOGRAPHIES module-attribute ¶

SUPPORTED_GEOGRAPHIES module-attribute ¶

SUPPORTED_RECORD_GEOGRAPHIES module-attribute ¶

BoroughName module-attribute ¶

AnalysisWindow dataclass ¶

days instance-attribute ¶

AnomalyResult dataclass ¶

geography instance-attribute ¶

geography_value instance-attribute ¶

complaint_type instance-attribute ¶

topic instance-attribute ¶

complaint_count instance-attribute ¶

geography_total_count instance-attribute ¶

share_of_geography instance-attribute ¶

topic_rank instance-attribute ¶

z_score instance-attribute ¶

is_anomaly instance-attribute ¶

window_days instance-attribute ¶

anomaly_threshold instance-attribute ¶

ExportTarget dataclass ¶

format instance-attribute ¶

output_path instance-attribute ¶

GeographyTopicSummary dataclass ¶

geography instance-attribute ¶

geography_value instance-attribute ¶

complaint_type instance-attribute ¶

topic instance-attribute ¶

complaint_count instance-attribute ¶

geography_total_count instance-attribute ¶

share_of_geography instance-attribute ¶

topic_rank instance-attribute ¶

is_dominant_topic instance-attribute ¶

ResolutionGapSummary dataclass ¶

geography instance-attribute ¶

geography_value instance-attribute ¶

complaint_type instance-attribute ¶

total_request_count instance-attribute ¶

resolved_request_count instance-attribute ¶

unresolved_request_count instance-attribute ¶

unresolved_share instance-attribute ¶

resolution_rate instance-attribute ¶

TopicCoverageReport dataclass ¶

complaint_type instance-attribute ¶

total_records instance-attribute ¶

matched_records instance-attribute ¶

other_records instance-attribute ¶

coverage_rate instance-attribute ¶

top_unmatched_descriptors instance-attribute ¶

TopicQuery dataclass ¶

complaint_type instance-attribute ¶

top_n class-attribute instance-attribute ¶

BoundaryGeoJSONExport dataclass ¶

boundaries instance-attribute ¶

summaries instance-attribute ¶

GeographyFilter dataclass ¶

geography instance-attribute ¶

value instance-attribute ¶

ServiceRequestFilter dataclass ¶

start_date class-attribute instance-attribute ¶

end_date class-attribute instance-attribute ¶

geography class-attribute instance-attribute ¶

complaint_types class-attribute instance-attribute ¶

SocrataConfig dataclass ¶

dataset_identifier class-attribute instance-attribute ¶

base_url class-attribute instance-attribute ¶

app_token class-attribute instance-attribute ¶

BoundaryCollection `module-attribute` ¶

BoundaryFeature `module-attribute` ¶

BOROUGH_BRONX `module-attribute` ¶

BOROUGH_BROOKLYN `module-attribute` ¶

BOROUGH_MANHATTAN `module-attribute` ¶

BOROUGH_QUEENS `module-attribute` ¶

BOROUGH_STATEN_ISLAND `module-attribute` ¶

SOCRATA_DATASET_IDENTIFIER `module-attribute` ¶

SUPPORTED_BOROUGHS `module-attribute` ¶

SUPPORTED_BOUNDARY_GEOGRAPHIES `module-attribute` ¶

SUPPORTED_GEOGRAPHIES `module-attribute` ¶

SUPPORTED_RECORD_GEOGRAPHIES `module-attribute` ¶

BoroughName `module-attribute` ¶

AnalysisWindow `dataclass` ¶

days `instance-attribute` ¶

AnomalyResult `dataclass` ¶

geography `instance-attribute` ¶

geography_value `instance-attribute` ¶

complaint_type `instance-attribute` ¶

topic `instance-attribute` ¶

complaint_count `instance-attribute` ¶

geography_total_count `instance-attribute` ¶

share_of_geography `instance-attribute` ¶

topic_rank `instance-attribute` ¶

z_score `instance-attribute` ¶

is_anomaly `instance-attribute` ¶

window_days `instance-attribute` ¶

anomaly_threshold `instance-attribute` ¶

ExportTarget `dataclass` ¶

format `instance-attribute` ¶

output_path `instance-attribute` ¶

GeographyTopicSummary `dataclass` ¶

geography `instance-attribute` ¶

geography_value `instance-attribute` ¶

complaint_type `instance-attribute` ¶

topic `instance-attribute` ¶

complaint_count `instance-attribute` ¶

geography_total_count `instance-attribute` ¶

share_of_geography `instance-attribute` ¶

topic_rank `instance-attribute` ¶

is_dominant_topic `instance-attribute` ¶

ResolutionGapSummary `dataclass` ¶

geography `instance-attribute` ¶

geography_value `instance-attribute` ¶

complaint_type `instance-attribute` ¶

total_request_count `instance-attribute` ¶

resolved_request_count `instance-attribute` ¶

unresolved_request_count `instance-attribute` ¶

unresolved_share `instance-attribute` ¶

resolution_rate `instance-attribute` ¶

TopicCoverageReport `dataclass` ¶

complaint_type `instance-attribute` ¶

total_records `instance-attribute` ¶

matched_records `instance-attribute` ¶

other_records `instance-attribute` ¶

coverage_rate `instance-attribute` ¶

top_unmatched_descriptors `instance-attribute` ¶

TopicQuery `dataclass` ¶

complaint_type `instance-attribute` ¶

top_n `class-attribute` `instance-attribute` ¶

BoundaryGeoJSONExport `dataclass` ¶

boundaries `instance-attribute` ¶

summaries `instance-attribute` ¶

GeographyFilter `dataclass` ¶

geography `instance-attribute` ¶

value `instance-attribute` ¶

ServiceRequestFilter `dataclass` ¶

start_date `class-attribute` `instance-attribute` ¶

end_date `class-attribute` `instance-attribute` ¶

geography `class-attribute` `instance-attribute` ¶

complaint_types `class-attribute` `instance-attribute` ¶

SocrataConfig `dataclass` ¶

dataset_identifier `class-attribute` `instance-attribute` ¶

base_url `class-attribute` `instance-attribute` ¶

app_token `class-attribute` `instance-attribute` ¶

page_size `class-attribute` `instance-attribute` ¶

request_timeout_seconds `class-attribute` `instance-attribute` ¶

max_pages `class-attribute` `instance-attribute` ¶

created_date_sort `class-attribute` `instance-attribute` ¶

extra_where_clauses `class-attribute` `instance-attribute` ¶