Skip to content

API Reference

The public API is organized around explicit namespaces rather than a flat root package.

The root nyc311 package is intentionally minimal and only exposes version metadata. Import functionality from the canonical public modules below.

nyc311.geographies is the one namespace that intentionally fronts another package: it preserves the 311-facing geography surface while delegating generic boundary loading and normalization primitives to nyc-geo-toolkit.

Update docstrings and exported symbols in src/nyc311/ rather than editing this reference structure by hand.

Root Package

nyc311

Minimal root namespace for the nyc311 package.

Models

nyc311.models

Public typed models and constants for the nyc311 package.

BoundaryCollection module-attribute

BoundaryCollection = BoundaryCollection

BoundaryFeature module-attribute

BoundaryFeature = BoundaryFeature

BOROUGH_BRONX module-attribute

BOROUGH_BRONX: Final[BoroughName] = 'BRONX'

BOROUGH_BROOKLYN module-attribute

BOROUGH_BROOKLYN: Final[BoroughName] = 'BROOKLYN'

BOROUGH_MANHATTAN module-attribute

BOROUGH_MANHATTAN: Final[BoroughName] = 'MANHATTAN'

BOROUGH_QUEENS module-attribute

BOROUGH_QUEENS: Final[BoroughName] = 'QUEENS'

BOROUGH_STATEN_ISLAND module-attribute

BOROUGH_STATEN_ISLAND: Final[BoroughName] = 'STATEN ISLAND'

SOCRATA_DATASET_IDENTIFIER module-attribute

SOCRATA_DATASET_IDENTIFIER: Final[str] = 'erm2-nwe9'

SUPPORTED_BOROUGHS module-attribute

SUPPORTED_BOROUGHS: Final[tuple[BoroughName, ...]] = (
    BOROUGH_BRONX,
    BOROUGH_BROOKLYN,
    BOROUGH_MANHATTAN,
    BOROUGH_QUEENS,
    BOROUGH_STATEN_ISLAND,
)

SUPPORTED_BOUNDARY_GEOGRAPHIES module-attribute

SUPPORTED_BOUNDARY_GEOGRAPHIES: Final[tuple[str, ...]] = (
    "borough",
    "community_district",
    "council_district",
    "neighborhood_tabulation_area",
    "census_tract",
    "zcta",
)

SUPPORTED_GEOGRAPHIES module-attribute

SUPPORTED_GEOGRAPHIES: Final[tuple[str, ...]] = (
    SUPPORTED_RECORD_GEOGRAPHIES
)

SUPPORTED_RECORD_GEOGRAPHIES module-attribute

SUPPORTED_RECORD_GEOGRAPHIES: Final[tuple[str, ...]] = (
    "borough",
    "community_district",
)

BoroughName module-attribute

BoroughName = str

AnalysisWindow dataclass

Rolling time window used for trend and anomaly calculations.

Source code in src/nyc311/models/_analysis.py
12
13
14
15
16
17
18
19
20
@dataclass(frozen=True, slots=True)
class AnalysisWindow:
    """Rolling time window used for trend and anomaly calculations."""

    days: int

    def __post_init__(self) -> None:
        if self.days < 1:
            raise ValueError("days must be at least 1.")

days instance-attribute

days: int

AnomalyResult dataclass

A standardized anomaly score for one aggregated topic summary.

Source code in src/nyc311/models/_analysis.py
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
@dataclass(frozen=True, slots=True)
class AnomalyResult:
    """A standardized anomaly score for one aggregated topic summary.

    Construction validates the fields below and raises ``ValueError`` on the
    first failed check. On success ``geography`` is stored stripped and
    lower-cased, and ``geography_value``, ``complaint_type`` and ``topic`` are
    replaced by their ``_normalize_value`` results. ``z_score`` and
    ``is_anomaly`` are stored as given.
    """

    # Geography level; must be one of SUPPORTED_GEOGRAPHIES after strip/lower.
    geography: str
    geography_value: str
    complaint_type: str
    topic: str
    # At least 1 and never larger than ``geography_total_count``.
    complaint_count: int
    geography_total_count: int
    # Must lie in the half-open interval (0, 1].
    share_of_geography: float
    # At least 1.
    topic_rank: int
    z_score: float
    is_anomaly: bool
    # At least 1.
    window_days: int
    # Strictly positive.
    anomaly_threshold: float

    def __post_init__(self) -> None:
        """Validate all fields, then store the normalized text values."""
        geography_key = self.geography.strip().lower()
        if geography_key not in SUPPORTED_GEOGRAPHIES:
            raise ValueError(
                "Unsupported anomaly geography. "
                f"Expected one of {SUPPORTED_GEOGRAPHIES}, got {self.geography!r}."
            )

        # Text fields: each must still be non-empty after normalization.
        area_value = _normalize_value(self.geography_value)
        if not area_value:
            raise ValueError("geography_value must not be empty.")
        complaint_label = _normalize_value(self.complaint_type)
        if not complaint_label:
            raise ValueError("complaint_type must not be empty.")
        topic_label = _normalize_value(self.topic)
        if not topic_label:
            raise ValueError("topic must not be empty.")

        # Numeric fields: simple range checks, in a fixed order.
        if self.complaint_count < 1:
            raise ValueError("complaint_count must be at least 1.")
        if self.geography_total_count < self.complaint_count:
            raise ValueError("geography_total_count must be >= complaint_count.")
        if not (0 < self.share_of_geography <= 1):
            raise ValueError("share_of_geography must be in the interval (0, 1].")
        if self.topic_rank < 1:
            raise ValueError("topic_rank must be at least 1.")
        if self.window_days < 1:
            raise ValueError("window_days must be at least 1.")
        if self.anomaly_threshold <= 0:
            raise ValueError("anomaly_threshold must be positive.")

        # The dataclass is frozen, so writes go through object.__setattr__.
        for attribute, cleaned in (
            ("geography", geography_key),
            ("geography_value", area_value),
            ("complaint_type", complaint_label),
            ("topic", topic_label),
        ):
            object.__setattr__(self, attribute, cleaned)

geography instance-attribute

geography: str

geography_value instance-attribute

geography_value: str

complaint_type instance-attribute

complaint_type: str

topic instance-attribute

topic: str

complaint_count instance-attribute

complaint_count: int

geography_total_count instance-attribute

geography_total_count: int

share_of_geography instance-attribute

share_of_geography: float

topic_rank instance-attribute

topic_rank: int

z_score instance-attribute

z_score: float

is_anomaly instance-attribute

is_anomaly: bool

window_days instance-attribute

window_days: int

anomaly_threshold instance-attribute

anomaly_threshold: float

ExportTarget dataclass

Destination metadata for supported exporters.

Source code in src/nyc311/models/_analysis.py
70
71
72
73
74
75
76
77
78
79
80
81
82
@dataclass(frozen=True, slots=True)
class ExportTarget:
    """Destination metadata for supported exporters."""

    format: str
    output_path: Path

    def __post_init__(self) -> None:
        normalized_format = self.format.strip().lower()
        if not normalized_format:
            raise ValueError("format must not be empty.")
        object.__setattr__(self, "format", normalized_format)
        object.__setattr__(self, "output_path", Path(self.output_path))

format instance-attribute

format: str

output_path instance-attribute

output_path: Path

GeographyTopicSummary dataclass

An export-ready summary row for topic counts within one geography.

Source code in src/nyc311/models/_analysis.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
@dataclass(frozen=True, slots=True)
class GeographyTopicSummary:
    """An export-ready summary row for topic counts within one geography.

    Construction raises ``ValueError`` on the first failed check. On success
    ``geography`` is stored stripped and lower-cased, and ``geography_value``,
    ``complaint_type`` and ``topic`` are replaced by their
    ``_normalize_value`` results. ``is_dominant_topic`` is stored as given.
    """

    # Geography level; must be one of SUPPORTED_GEOGRAPHIES after strip/lower.
    geography: str
    geography_value: str
    complaint_type: str
    topic: str
    # At least 1 and never larger than ``geography_total_count``.
    complaint_count: int
    geography_total_count: int
    # Must lie in the half-open interval (0, 1].
    share_of_geography: float
    # At least 1.
    topic_rank: int
    is_dominant_topic: bool

    def __post_init__(self) -> None:
        """Validate all fields, then store the normalized text values."""
        geography_key = self.geography.strip().lower()
        if geography_key not in SUPPORTED_GEOGRAPHIES:
            raise ValueError(
                "Unsupported geography summary. "
                f"Expected one of {SUPPORTED_GEOGRAPHIES}, got {self.geography!r}."
            )

        # Numeric range checks come first for this model.
        if self.complaint_count < 1:
            raise ValueError("complaint_count must be at least 1.")
        if self.geography_total_count < self.complaint_count:
            raise ValueError("geography_total_count must be >= complaint_count.")
        if not (0 < self.share_of_geography <= 1):
            raise ValueError("share_of_geography must be in the interval (0, 1].")
        if self.topic_rank < 1:
            raise ValueError("topic_rank must be at least 1.")

        # Text fields: each must still be non-empty after normalization.
        area_value = _normalize_value(self.geography_value)
        if not area_value:
            raise ValueError("geography_value must not be empty.")
        complaint_label = _normalize_value(self.complaint_type)
        if not complaint_label:
            raise ValueError("complaint_type must not be empty.")
        topic_label = _normalize_value(self.topic)
        if not topic_label:
            raise ValueError("topic must not be empty.")

        # The dataclass is frozen, so writes go through object.__setattr__.
        for attribute, cleaned in (
            ("geography", geography_key),
            ("geography_value", area_value),
            ("complaint_type", complaint_label),
            ("topic", topic_label),
        ):
            object.__setattr__(self, attribute, cleaned)

geography instance-attribute

geography: str

geography_value instance-attribute

geography_value: str

complaint_type instance-attribute

complaint_type: str

topic instance-attribute

topic: str

complaint_count instance-attribute

complaint_count: int

geography_total_count instance-attribute

geography_total_count: int

share_of_geography instance-attribute

share_of_geography: float

topic_rank instance-attribute

topic_rank: int

is_dominant_topic instance-attribute

is_dominant_topic: bool

ResolutionGapSummary dataclass

A first-pass borough-level summary of unresolved complaint volume.

Source code in src/nyc311/models/_analysis.py
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
@dataclass(frozen=True, slots=True)
class ResolutionGapSummary:
    """A first-pass borough-level summary of unresolved complaint volume.

    Construction raises ``ValueError`` on the first failed check. The resolved
    and unresolved counts must be non-negative and add up to
    ``total_request_count``; both rates must lie in ``[0, 1]``. On success
    ``geography`` is stored stripped and lower-cased, and ``geography_value``
    and ``complaint_type`` are replaced by their ``_normalize_value`` results.
    """

    # Geography level; must be one of SUPPORTED_GEOGRAPHIES after strip/lower.
    geography: str
    geography_value: str
    complaint_type: str
    # At least 1; equals resolved + unresolved.
    total_request_count: int
    resolved_request_count: int
    unresolved_request_count: int
    # Both rates are range-checked against the closed interval [0, 1].
    unresolved_share: float
    resolution_rate: float

    def __post_init__(self) -> None:
        """Validate all fields, then store the normalized text values."""
        geography_key = self.geography.strip().lower()
        if geography_key not in SUPPORTED_GEOGRAPHIES:
            raise ValueError(
                "Unsupported geography summary. "
                f"Expected one of {SUPPORTED_GEOGRAPHIES}, got {self.geography!r}."
            )

        if self.total_request_count < 1:
            raise ValueError("total_request_count must be at least 1.")

        resolved = self.resolved_request_count
        unresolved = self.unresolved_request_count
        if any(count < 0 for count in (resolved, unresolved)):
            raise ValueError("resolution counts must be non-negative.")
        if resolved + unresolved != self.total_request_count:
            raise ValueError(
                "resolved_request_count + unresolved_request_count must equal total_request_count."
            )

        if not (0 <= self.unresolved_share <= 1):
            raise ValueError("unresolved_share must be in the interval [0, 1].")
        if not (0 <= self.resolution_rate <= 1):
            raise ValueError("resolution_rate must be in the interval [0, 1].")

        # Text fields: each must still be non-empty after normalization.
        area_value = _normalize_value(self.geography_value)
        if not area_value:
            raise ValueError("geography_value must not be empty.")
        complaint_label = _normalize_value(self.complaint_type)
        if not complaint_label:
            raise ValueError("complaint_type must not be empty.")

        # The dataclass is frozen, so writes go through object.__setattr__.
        store = object.__setattr__
        store(self, "geography", geography_key)
        store(self, "geography_value", area_value)
        store(self, "complaint_type", complaint_label)

geography instance-attribute

geography: str

geography_value instance-attribute

geography_value: str

complaint_type instance-attribute

complaint_type: str

total_request_count instance-attribute

total_request_count: int

resolved_request_count instance-attribute

resolved_request_count: int

unresolved_request_count instance-attribute

unresolved_request_count: int

unresolved_share instance-attribute

unresolved_share: float

resolution_rate instance-attribute

resolution_rate: float

TopicCoverageReport dataclass

Coverage metadata that shows how much a topic ruleset matched.

Source code in src/nyc311/models/_analysis.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
@dataclass(frozen=True, slots=True)
class TopicCoverageReport:
    """Coverage metadata that shows how much a topic ruleset matched.

    Construction raises ``ValueError`` when ``complaint_type`` is empty after
    normalization, when any record count is negative, when
    ``matched_records + other_records`` differs from ``total_records``, or
    when ``coverage_rate`` falls outside ``[0, 1]``. On success
    ``complaint_type`` is replaced by its ``_normalize_value`` result.
    """

    complaint_type: str
    # Non-negative; equals matched_records + other_records.
    total_records: int
    matched_records: int
    other_records: int
    # Range-checked against the closed interval [0, 1].
    coverage_rate: float
    # Pairs of (descriptor, count); stored as given, not validated here.
    top_unmatched_descriptors: tuple[tuple[str, int], ...]

    def __post_init__(self) -> None:
        """Validate the counts and rate, then store the normalized type."""
        complaint_label = _normalize_value(self.complaint_type)
        if not complaint_label:
            raise ValueError("complaint_type must not be empty.")

        # Same check for each counter, reported under the counter's own name.
        for counter_name, counter in (
            ("total_records", self.total_records),
            ("matched_records", self.matched_records),
            ("other_records", self.other_records),
        ):
            if counter < 0:
                raise ValueError(f"{counter_name} must be non-negative.")

        if self.matched_records + self.other_records != self.total_records:
            raise ValueError(
                "matched_records + other_records must equal total_records."
            )
        if not (0 <= self.coverage_rate <= 1):
            raise ValueError("coverage_rate must be in the interval [0, 1].")

        # The dataclass is frozen, so the write goes through object.__setattr__.
        object.__setattr__(self, "complaint_type", complaint_label)

complaint_type instance-attribute

complaint_type: str

total_records instance-attribute

total_records: int

matched_records instance-attribute

matched_records: int

other_records instance-attribute

other_records: int

coverage_rate instance-attribute

coverage_rate: float

top_unmatched_descriptors instance-attribute

top_unmatched_descriptors: tuple[tuple[str, int], ...]

TopicQuery dataclass

Topic-analysis parameters for the implemented rules-based workflow.

Source code in src/nyc311/models/_analysis.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
@dataclass(frozen=True, slots=True)
class TopicQuery:
    """Topic-analysis parameters for the implemented rules-based workflow.

    ``complaint_type`` is replaced by its ``_normalize_value`` result and must
    not be empty afterwards; ``top_n`` must be at least 1. Either violation
    raises ``ValueError``.
    """

    complaint_type: str
    # Defaults to 20; validated to be >= 1.
    top_n: int = 20

    def __post_init__(self) -> None:
        """Validate both fields and store the normalized complaint type."""
        complaint_label = _normalize_value(self.complaint_type)
        if not complaint_label:
            raise ValueError("complaint_type must not be empty.")

        limit_too_small = self.top_n < 1
        if limit_too_small:
            raise ValueError("top_n must be at least 1.")

        # The dataclass is frozen, so the write goes through object.__setattr__.
        object.__setattr__(self, "complaint_type", complaint_label)

complaint_type instance-attribute

complaint_type: str

top_n class-attribute instance-attribute

top_n: int = 20

BoundaryGeoJSONExport dataclass

Combined boundary + summary payload for GeoJSON export.

Source code in src/nyc311/models/_boundaries.py
22
23
24
25
26
27
@dataclass(frozen=True, slots=True)
class BoundaryGeoJSONExport:
    """Combined boundary + summary payload for GeoJSON export.

    A frozen, slotted container with two fields. No ``__post_init__`` is
    defined here, so both values are stored exactly as passed in.
    """

    # Boundary collection to export.
    boundaries: BoundaryCollection
    # Summary rows carried alongside the boundaries.
    summaries: tuple[GeographyTopicSummary, ...]

boundaries instance-attribute

boundaries: BoundaryCollection

summaries instance-attribute

summaries: tuple[GeographyTopicSummary, ...]

GeographyFilter dataclass

A supported geography selector for implemented loading filters.

Source code in src/nyc311/models/_filters.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
@dataclass(frozen=True, slots=True)
class GeographyFilter:
    """A supported geography selector for implemented loading filters.

    ``geography`` is stored stripped and lower-cased and must be one of
    ``SUPPORTED_GEOGRAPHIES``. ``value`` goes through
    ``normalize_borough_name`` for the ``"borough"`` geography and through
    ``_normalize_value`` otherwise; it must not be empty afterwards.
    Violations raise ``ValueError``.
    """

    geography: str
    value: str

    def __post_init__(self) -> None:
        """Normalize both fields and validate them."""
        geography_key = self.geography.strip().lower()

        # The value is normalized before the geography is checked, so a bad
        # borough name is reported by ``normalize_borough_name`` itself.
        if geography_key == "borough":
            selector_value = normalize_borough_name(self.value)
        else:
            selector_value = _normalize_value(self.value)

        if geography_key not in SUPPORTED_GEOGRAPHIES:
            raise ValueError(
                "Unsupported geography filter. "
                f"Expected one of {SUPPORTED_GEOGRAPHIES}, got {self.geography!r}."
            )
        if not selector_value:
            raise ValueError("Geography filter value must not be empty.")

        # The dataclass is frozen, so writes go through object.__setattr__.
        store = object.__setattr__
        store(self, "geography", geography_key)
        store(self, "value", selector_value)

geography instance-attribute

geography: str

value instance-attribute

value: str

ServiceRequestFilter dataclass

Filters for CSV and Socrata service-request loading.

Source code in src/nyc311/models/_filters.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
@dataclass(frozen=True, slots=True)
class ServiceRequestFilter:
    """Filters for CSV and Socrata service-request loading."""

    start_date: date | None = None
    end_date: date | None = None
    geography: GeographyFilter | None = None
    complaint_types: tuple[str, ...] = ()

    def __post_init__(self) -> None:
        if self.start_date and self.end_date and self.start_date > self.end_date:
            raise ValueError("start_date must be on or before end_date.")

        normalized_complaint_types = tuple(
            normalized
            for raw_value in self.complaint_types
            if (normalized := _normalize_value(raw_value))
        )
        object.__setattr__(self, "complaint_types", normalized_complaint_types)

start_date class-attribute instance-attribute

start_date: date | None = None

end_date class-attribute instance-attribute

end_date: date | None = None

geography class-attribute instance-attribute

geography: GeographyFilter | None = None

complaint_types class-attribute instance-attribute

complaint_types: tuple[str, ...] = ()

SocrataConfig dataclass

Configuration for the implemented live Socrata loader path.

extra_where_clauses holds additional $where fragments (Socrata SoQL) that are AND-joined after the predicates derived from ServiceRequestFilter. Use it for predicates not covered by the filter (e.g. latitude IS NOT NULL). Values are stripped; empty strings are dropped.

Source code in src/nyc311/models/_filters.py
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
@dataclass(frozen=True, slots=True)
class SocrataConfig:
    """Configuration for the implemented live Socrata loader path.

    ``extra_where_clauses`` holds additional ``$where`` fragments (Socrata SoQL) that
    are AND-joined after the predicates derived from :class:`ServiceRequestFilter`.
    Use for predicates not covered by the filter (e.g. ``latitude IS NOT NULL``).
    Values are stripped; empty strings are dropped.

    Construction raises ``ValueError`` on the first failed check. On success
    ``dataset_identifier`` is stored stripped and ``base_url`` is stored
    without trailing slashes.
    """

    dataset_identifier: str = SOCRATA_DATASET_IDENTIFIER
    base_url: str = "https://data.cityofnewyork.us/resource"
    app_token: str | None = None
    # At least 1.
    page_size: int = 1000
    # Strictly positive.
    request_timeout_seconds: float = 30.0
    # ``None`` or at least 1.
    max_pages: int | None = None
    created_date_sort: Literal["asc", "desc"] = "asc"
    extra_where_clauses: tuple[str, ...] = field(default_factory=tuple)

    def __post_init__(self) -> None:
        """Validate the settings and store the cleaned-up values."""
        identifier = self.dataset_identifier.strip()
        endpoint_root = self.base_url.rstrip("/")

        if identifier == "":
            raise ValueError("dataset_identifier must not be empty.")
        if endpoint_root == "":
            raise ValueError("base_url must not be empty.")
        if self.page_size < 1:
            raise ValueError("page_size must be at least 1.")
        if self.request_timeout_seconds <= 0:
            raise ValueError("request_timeout_seconds must be positive.")
        if self.max_pages is not None:
            if self.max_pages < 1:
                raise ValueError("max_pages must be at least 1 when provided.")
        if self.created_date_sort not in ("asc", "desc"):
            raise ValueError("created_date_sort must be 'asc' or 'desc'.")

        kept_clauses = []
        for raw_value in self.extra_where_clauses:
            cleaned = _normalize_value(raw_value)
            if cleaned:
                kept_clauses.append(cleaned)

        # The dataclass is frozen, so writes go through object.__setattr__.
        store = object.__setattr__
        store(self, "dataset_identifier", identifier)
        store(self, "base_url", endpoint_root)
        store(self, "extra_where_clauses", tuple(kept_clauses))

dataset_identifier class-attribute instance-attribute

dataset_identifier: str = SOCRATA_DATASET_IDENTIFIER

base_url class-attribute instance-attribute

base_url: str = 'https://data.cityofnewyork.us/resource'

app_token class-attribute instance-attribute

app_token: str | None = None

page_size class-attribute instance-attribute

page_size: int = 1000

request_timeout_seconds class-attribute instance-attribute

request_timeout_seconds: float = 30.0

max_pages class-attribute instance-attribute

max_pages: int | None = None

created_date_sort class-attribute instance-attribute

created_date_sort: Literal['asc', 'desc'] = 'asc'

extra_where_clauses class-attribute instance-attribute

extra_where_clauses: tuple[str, ...] = field(
    default_factory=tuple
)

ServiceRequestRecord dataclass

A single loaded NYC 311-style service-request record.

Note: As of nyc311 v1.0.1, closed_date is carried alongside created_date so resolution-time analyses don't have to bypass the SDK. The field is optional — Socrata returns a null closed_date for any unresolved complaint — and existing call sites that instantiate the record without it keep working unchanged.

Source code in src/nyc311/models/_records.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
@dataclass(frozen=True, slots=True)
class ServiceRequestRecord:
    """A single loaded NYC 311-style service-request record.

    .. note::

        As of nyc311 v1.0.1, ``closed_date`` is carried alongside
        ``created_date`` so resolution-time analyses don't have to
        bypass the SDK. The field is optional — Socrata returns a
        null ``closed_date`` for any unresolved complaint — and
        existing call sites that instantiate the record without it
        keep working unchanged.
    """

    service_request_id: str
    created_date: date
    complaint_type: str
    descriptor: str
    borough: str
    community_district: str
    resolution_description: str | None = None
    latitude: float | None = None
    longitude: float | None = None
    #: Date the complaint was closed. ``None`` for unresolved
    #: complaints. Use ``closed_date - created_date`` for resolution
    #: latency in days.
    closed_date: date | None = None

    def __post_init__(self) -> None:
        if not _normalize_value(self.service_request_id):
            raise ValueError("service_request_id must not be empty.")
        if not _normalize_value(self.complaint_type):
            raise ValueError("complaint_type must not be empty.")
        if not _normalize_value(self.borough):
            raise ValueError("borough must not be empty.")
        if not _normalize_value(self.community_district):
            raise ValueError("community_district must not be empty.")

        object.__setattr__(
            self, "service_request_id", _normalize_value(self.service_request_id)
        )
        object.__setattr__(
            self, "complaint_type", _normalize_value(self.complaint_type)
        )
        object.__setattr__(self, "descriptor", _normalize_value(self.descriptor))
        object.__setattr__(
            self, "borough", _normalize_borough_or_passthrough(self.borough)
        )
        object.__setattr__(
            self,
            "community_district",
            _normalize_community_district_or_passthrough(self.community_district),
        )

        normalized_resolution = (
            None
            if self.resolution_description is None
            else _normalize_value(self.resolution_description)
        )
        object.__setattr__(
            self,
            "resolution_description",
            normalized_resolution if normalized_resolution else None,
        )

        latitude, longitude = _normalize_coordinate_pair(
            self.latitude,
            self.longitude,
        )
        object.__setattr__(self, "latitude", latitude)
        object.__setattr__(self, "longitude", longitude)

    def geography_value(self, geography: str) -> str:
        """Return the value for a supported geography key."""
        normalized_geography = geography.strip().lower()
        if normalized_geography == "borough":
            return self.borough
        if normalized_geography == "community_district":
            return self.community_district
        msg = (
            "Unsupported aggregation geography. "
            f"Expected one of {SUPPORTED_GEOGRAPHIES}, got {geography!r}."
        )
        raise ValueError(msg)

service_request_id instance-attribute

service_request_id: str

created_date instance-attribute

created_date: date

complaint_type instance-attribute

complaint_type: str

descriptor instance-attribute

descriptor: str

borough instance-attribute

borough: str

community_district instance-attribute

community_district: str

resolution_description class-attribute instance-attribute

resolution_description: str | None = None

latitude class-attribute instance-attribute

latitude: float | None = None

longitude class-attribute instance-attribute

longitude: float | None = None

closed_date class-attribute instance-attribute

closed_date: date | None = None

geography_value

geography_value(geography: str) -> str

Return the value for a supported geography key.

Source code in src/nyc311/models/_records.py
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def geography_value(self, geography: str) -> str:
    """Return the value for a supported geography key."""
    normalized_geography = geography.strip().lower()
    if normalized_geography == "borough":
        return self.borough
    if normalized_geography == "community_district":
        return self.community_district
    msg = (
        "Unsupported aggregation geography. "
        f"Expected one of {SUPPORTED_GEOGRAPHIES}, got {geography!r}."
    )
    raise ValueError(msg)

TopicAssignment dataclass

A deterministic topic label derived from one service-request record.

Source code in src/nyc311/models/_records.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
@dataclass(frozen=True, slots=True)
class TopicAssignment:
    """A deterministic topic label derived from one service-request record.

    ``topic`` and ``normalized_text`` are replaced by their
    ``_normalize_value`` results; either one being empty afterwards raises
    ``ValueError``. ``record`` is stored as given.
    """

    # The record the topic was derived from.
    record: ServiceRequestRecord
    topic: str
    normalized_text: str

    def __post_init__(self) -> None:
        """Validate and normalize the two text fields."""
        topic_label = _normalize_value(self.topic)
        if not topic_label:
            raise ValueError("topic must not be empty.")
        cleaned_text = _normalize_value(self.normalized_text)
        if not cleaned_text:
            raise ValueError("normalized_text must not be empty.")

        # The dataclass is frozen, so writes go through object.__setattr__.
        for attribute, cleaned in (
            ("topic", topic_label),
            ("normalized_text", cleaned_text),
        ):
            object.__setattr__(self, attribute, cleaned)

record instance-attribute

record: ServiceRequestRecord

topic instance-attribute

topic: str

normalized_text instance-attribute

normalized_text: str

supported_topic_queries

supported_topic_queries() -> tuple[str, ...]

Return the complaint types with implemented topic extraction.

Source code in src/nyc311/models/_constants.py
63
64
65
def supported_topic_queries() -> tuple[str, ...]:
    """Return the complaint types with implemented topic extraction.

    The module-level ``_SUPPORTED_TOPIC_QUERIES`` object is returned as-is,
    not copied.
    """
    implemented_complaint_types = _SUPPORTED_TOPIC_QUERIES
    return implemented_complaint_types

normalize_borough_name

normalize_borough_name(value: str) -> str

Normalize a borough name or borough alias to the canonical public constant.

Source code in src/nyc311/models/_normalize.py
109
110
111
112
113
114
115
116
117
def normalize_borough_name(value: str) -> str:
    """Normalize a borough name or borough alias to the canonical public constant.

    The input goes through ``_normalize_borough_or_passthrough``; a result
    that is not in ``SUPPORTED_BOROUGHS`` raises ``ValueError`` quoting the
    original ``value``.
    """
    canonical_name = _normalize_borough_or_passthrough(value)
    if canonical_name in SUPPORTED_BOROUGHS:
        return canonical_name
    raise ValueError(
        "Unsupported borough name. "
        f"Expected one of {SUPPORTED_BOROUGHS}, got {value!r}."
    )

IO

nyc311.io

Public loading helpers for service-request data.

REQUIRED_SERVICE_REQUEST_COLUMNS module-attribute

REQUIRED_SERVICE_REQUEST_COLUMNS: Final[tuple[str, ...]] = (
    SERVICE_REQUEST_CSV_COLUMNS
)

cache_path_for_request

cache_path_for_request(
    socrata_config: SocrataConfig,
    filters: ServiceRequestFilter,
    cache_dir: Path,
) -> Path

Return the deterministic CSV path for a Socrata config + filter pair.

Source code in src/nyc311/io/_cache.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def cache_path_for_request(
    socrata_config: SocrataConfig,
    filters: ServiceRequestFilter,
    cache_dir: Path,
) -> Path:
    """Return the deterministic CSV path for a Socrata config + filter pair.

    The file name is built from the geography filter, the complaint types, the
    date range, the page size and the sort direction. Three inputs that used
    to be ignored (and could make different requests share one cache file)
    now also contribute to the name:

    * a non-borough geography filter (e.g. ``community_district``) is encoded
      as ``<geography>_<value>`` instead of collapsing to ``"all"``;
    * non-empty ``socrata_config.extra_where_clauses`` append a short
      ``_w<digest>`` suffix;
    * a multi-complaint-type slug longer than 120 characters keeps its
      120-character prefix and gains a digest of the full slug.

    Requests that use none of these produce exactly the same file name as
    before, so existing cache files stay valid.

    Args:
        socrata_config: Supplies ``page_size``, ``created_date_sort`` and
            ``extra_where_clauses``.
        filters: Supplies the date range, geography and complaint types.
        cache_dir: Directory the returned path is placed in.

    Returns:
        ``cache_dir`` joined with the derived ``.csv`` file name.
    """
    # Local import keeps this block self-contained.
    import hashlib

    def _short_digest(text: str) -> str:
        # Stable, filesystem-safe fingerprint; not used for security.
        return hashlib.sha256(text.encode("utf-8")).hexdigest()[:10]

    start = filters.start_date.isoformat() if filters.start_date else "none"
    end = filters.end_date.isoformat() if filters.end_date else "none"
    page = socrata_config.page_size
    sort_suffix = "_desc" if socrata_config.created_date_sort == "desc" else ""

    # Extra $where fragments change the result set, so they must change the
    # cache key too. Without any, the suffix is empty and the name is unchanged.
    extra_clauses = tuple(getattr(socrata_config, "extra_where_clauses", ()) or ())
    where_suffix = f"_w{_short_digest(chr(10).join(extra_clauses))}" if extra_clauses else ""
    tail = f"{start}_{end}_{page}{sort_suffix}{where_suffix}.csv"

    geography = filters.geography
    if geography is None and not filters.complaint_types:
        return cache_dir / f"all_{tail}"

    if geography is None:
        area = "all"
    elif geography.geography == "borough":
        area = _slug(geography.value)
    else:
        # Previously every non-borough filter mapped to "all" and collided.
        area = f"{_slug(geography.geography)}_{_slug(geography.value)}"

    complaint_types = filters.complaint_types
    if not complaint_types:
        ct_slug = "all"
    elif len(complaint_types) == 1:
        ct_slug = _slug(complaint_types[0])
    else:
        joined = "_".join(sorted(_slug(c) for c in complaint_types))
        if len(joined) <= 120:
            ct_slug = joined
        else:
            # Keep the readable prefix but disambiguate truncated lists.
            ct_slug = f"{joined[:120]}_{_short_digest(joined)}"

    return cache_dir / f"{area}_{ct_slug}_{tail}"

cached_fetch

cached_fetch(
    socrata_config: SocrataConfig,
    filters: ServiceRequestFilter,
    *,
    cache_dir: Path,
    refresh: bool = False,
    request_open: Callable[..., Any] | None = None,
    max_records: int | None = None,
    on_page: Callable[[int, int], None] | None = None,
) -> Path

Stream a Socrata query to a CSV file under cache_dir; return the path.

Skips the network fetch when the file already exists and refresh is False. Rows are filtered with the same rules as load_service_requests_from_socrata.

For multi-gigabyte extracts, prefer this function and analyze the result with chunked pandas.read_csv instead of loading it via load_service_requests, which materializes all rows in memory.

The optional on_page callback is forwarded to nyc311.io.iter_service_requests_from_socrata for per-HTTP-page progress (page index and row count for that page).

Source code in src/nyc311/io/_cache.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
def cached_fetch(
    socrata_config: SocrataConfig,
    filters: ServiceRequestFilter,
    *,
    cache_dir: Path,
    refresh: bool = False,
    request_open: Callable[..., Any] | None = None,
    max_records: int | None = None,
    on_page: Callable[[int, int], None] | None = None,
) -> Path:
    """Stream a Socrata query to a CSV file under ``cache_dir``; return the path.

    Skips the network fetch when the file already exists and ``refresh`` is False.
    Rows are filtered with the same rules as :func:`load_service_requests_from_socrata`.

    For multi-gigabyte extracts, prefer this function and analyze with chunked
    ``pandas.read_csv`` instead of loading via :func:`load_service_requests`, which
    materializes rows in memory.

    Optional ``on_page`` is forwarded to :func:`nyc311.io.iter_service_requests_from_socrata`
    for per-HTTP-page progress (page index and row count for that page).

    Args:
        socrata_config: Connection and paging settings for the Socrata query.
        filters: Filter used both to build the request and to re-check each
            record before it is written.
        cache_dir: Directory that receives the CSV; created (with parents)
            when it does not exist.
        refresh: When True, delete any existing cached and partial file and
            fetch again.
        request_open: Callable used to open HTTP requests; ``urlopen`` is used
            when this is None.
        max_records: Stop after this many rows have been written. The cap is
            checked after each write.
        on_page: Optional per-page progress callback.

    Returns:
        Path of the completed CSV file inside ``cache_dir``.

    Raises:
        BaseException: Anything raised while streaming is re-raised after the
            partial file has been removed.
    """
    opener = urlopen if request_open is None else request_open
    cache_dir = Path(cache_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)
    # The file name is derived from the config and filters, so the same
    # request maps to the same cache file.
    output_path = cache_path_for_request(socrata_config, filters, cache_dir)
    # Rows are streamed to a sibling "partial" path first and only moved over
    # ``output_path`` once the whole stream has been written.
    partial_path = _partial_cache_path(output_path)

    # Cache hit: reuse the existing file without touching the network.
    if output_path.is_file() and not refresh:
        return output_path

    if refresh:
        # A forced refresh discards both the finished file and any leftover.
        if output_path.is_file():
            output_path.unlink()
        if partial_path.is_file():
            partial_path.unlink()
    elif partial_path.is_file() and not output_path.is_file():
        # Interrupted previous run left a partial file; do not treat as complete.
        partial_path.unlink()

    written = 0
    try:
        with partial_path.open("w", newline="", encoding="utf-8") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=SERVICE_REQUEST_EXPORT_COLUMNS)
            writer.writeheader()
            for record in iter_service_requests_from_socrata(
                socrata_config,
                filters=filters,
                request_open=opener,
                on_page=on_page,
            ):
                # Re-check each record locally; non-matching rows are skipped
                # and do not count towards ``max_records``.
                if not record_matches_service_request_filter(record, filters):
                    continue
                _write_record_row(writer, record)
                written += 1
                # NOTE(review): the cap is tested after the write, so
                # ``max_records=0`` still writes one row - confirm intended.
                if max_records is not None and written >= max_records:
                    break
        # Promote the finished partial file to the final cache path.
        partial_path.replace(output_path)
    except BaseException:
        # BaseException also covers KeyboardInterrupt, so an aborted run does
        # not leave a partial file behind.
        if partial_path.is_file():
            partial_path.unlink()
        raise

    # Presumably records sidecar metadata (row count, config, filters) for the
    # cached file - see ``_write_meta`` to confirm.
    _write_meta(output_path, written, socrata_config, filters)
    return output_path

load_service_requests_from_csv

load_service_requests_from_csv(
    source: str | Path, *, filters: ServiceRequestFilter
) -> list[ServiceRequestRecord]

Load and filter service-request records from a local CSV snapshot.

Source code in src/nyc311/io/_csv.py
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def load_service_requests_from_csv(
    source: str | Path,
    *,
    filters: ServiceRequestFilter,
) -> list[ServiceRequestRecord]:
    """Read a local CSV snapshot and return the records that pass ``filters``.

    Every row is parsed into a record first; ``filters`` is applied to the
    complete list afterwards.

    Raises:
        ValueError: If the CSV file has no header row.
    """
    with Path(source).open(newline="", encoding="utf-8") as handle:
        rows = csv.DictReader(handle)
        header = rows.fieldnames
        if header is None:
            raise ValueError("CSV file must include a header row.")

        # Column validation also reports which community-district column the
        # snapshot uses.
        district_column = _validate_columns(header)
        parsed: list[ServiceRequestRecord] = []
        for row in rows:
            parsed.append(_record_from_mapping(row, district_column))

    return _apply_filters(parsed, filters)

load_resolution_data

load_resolution_data(
    source: str | Path | SocrataConfig,
    *,
    filters: ServiceRequestFilter | None = None,
    cache_dir: Path | str | None = None,
    refresh: bool = False,
    max_cached_records: int | None = None,
) -> list[ServiceRequestRecord]

Load the subset of service requests that already include resolution text.

Source code in src/nyc311/io/_service_requests.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def load_resolution_data(
    source: str | Path | SocrataConfig,
    *,
    filters: ServiceRequestFilter | None = None,
    cache_dir: Path | str | None = None,
    refresh: bool = False,
    max_cached_records: int | None = None,
) -> list[ServiceRequestRecord]:
    """Return only the service requests whose resolution text is present.

    All arguments are passed through to :func:`load_service_requests`; records
    with ``resolution_description`` set to ``None`` are dropped from the result.
    """
    candidates = load_service_requests(
        source,
        filters=filters,
        cache_dir=cache_dir,
        refresh=refresh,
        max_cached_records=max_cached_records,
    )
    resolved: list[ServiceRequestRecord] = []
    for candidate in candidates:
        if candidate.resolution_description is None:
            continue
        resolved.append(candidate)
    return resolved

load_service_requests

load_service_requests(
    source: str | Path | SocrataConfig,
    *,
    filters: ServiceRequestFilter | None = None,
    cache_dir: Path | str | None = None,
    refresh: bool = False,
    max_cached_records: int | None = None,
) -> list[ServiceRequestRecord]

Load filtered NYC 311-style service-request records from CSV or Socrata.

When source is a `nyc311.models.SocrataConfig` and cache_dir is set, the live API response is streamed to a deterministic CSV under cache_dir (see `cached_fetch`), then loaded from disk. Very large extracts should use `cached_fetch` with chunked pandas analysis instead of this helper, which returns an in-memory list.

Source code in src/nyc311/io/_service_requests.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def load_service_requests(
    source: str | Path | SocrataConfig,
    *,
    filters: ServiceRequestFilter | None = None,
    cache_dir: Path | str | None = None,
    refresh: bool = False,
    max_cached_records: int | None = None,
) -> list[ServiceRequestRecord]:
    """Load filtered service-request records from a CSV file or from Socrata.

    Dispatch depends on ``source``:

    * anything that is not a :class:`~nyc311.models.SocrataConfig` is read as
      a local CSV snapshot;
    * a ``SocrataConfig`` without ``cache_dir`` is fetched live;
    * a ``SocrataConfig`` with ``cache_dir`` is first streamed to a CSV under
      ``cache_dir`` (see :func:`cached_fetch`) and then loaded from disk.

    The result is an in-memory list; very large extracts should use
    :func:`cached_fetch` with chunked pandas analysis instead.
    """
    active_filter = filters or ServiceRequestFilter()

    if not isinstance(source, SocrataConfig):
        return load_service_requests_from_csv(source, filters=active_filter)

    if cache_dir is None:
        return load_service_requests_from_socrata(
            source,
            filters=active_filter,
            request_open=urlopen,
        )

    cached_csv = cached_fetch(
        source,
        active_filter,
        cache_dir=Path(cache_dir),
        refresh=refresh,
        request_open=urlopen,
        max_records=max_cached_records,
    )
    return load_service_requests_from_csv(cached_csv, filters=active_filter)

iter_service_requests_from_socrata

iter_service_requests_from_socrata(
    socrata_config: SocrataConfig,
    *,
    filters: ServiceRequestFilter,
    request_open: Callable[..., Any],
    on_page: Callable[[int, int], None] | None = None,
) -> Iterator[ServiceRequestRecord]

Yield service-request records from Socrata without holding all pages in memory.

on_page is invoked after each successful HTTP response with (page_index, row_count_in_page) (0-based page index).

Source code in src/nyc311/io/_socrata.py
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
def iter_service_requests_from_socrata(
    socrata_config: SocrataConfig,
    *,
    filters: ServiceRequestFilter,
    request_open: Callable[..., Any],
    on_page: Callable[[int, int], None] | None = None,
) -> Iterator[ServiceRequestRecord]:
    """Lazily yield service-request records from Socrata, one page at a time.

    Paging stops when ``socrata_config.max_pages`` pages have been requested,
    when a page comes back empty, or when a page holds fewer rows than
    ``socrata_config.page_size``.

    ``on_page`` is called after each successful HTTP response with
    ``(page_index, row_count_in_page)``; the page index is 0-based.

    Raises:
        ValueError: If a row in the response is not a JSON object.
    """
    request_headers = {"Accept": "application/json"}
    if socrata_config.app_token is not None:
        request_headers["X-App-Token"] = socrata_config.app_token

    page_size = socrata_config.page_size
    page_index = 0

    while (
        socrata_config.max_pages is None or page_index < socrata_config.max_pages
    ):
        # Only full pages advance the index, so the row offset is always a
        # whole number of pages.
        page_url = _build_socrata_url(
            socrata_config, filters, offset=page_index * page_size
        )
        rows = _fetch_socrata_page_json(
            Request(page_url, headers=request_headers),
            request_open=request_open,
            timeout=socrata_config.request_timeout_seconds,
        )

        if on_page is not None:
            on_page(page_index, len(rows))

        if not rows:
            return

        for row in rows:
            if not isinstance(row, dict):
                raise ValueError(
                    "Unexpected Socrata response row; expected a JSON object."
                )
            normalized = _normalize_socrata_row(row)
            if "community_district" in normalized:
                district_key = "community_district"
            else:
                district_key = "community_board"
            yield _record_from_mapping(normalized, district_key)

        # A short page means the result set is exhausted.
        if len(rows) < page_size:
            return
        page_index += 1

load_service_requests_from_socrata

load_service_requests_from_socrata(
    socrata_config: SocrataConfig,
    *,
    filters: ServiceRequestFilter,
    request_open: Callable[..., Any],
) -> list[ServiceRequestRecord]

Load and filter service-request records from the live Socrata API.

Source code in src/nyc311/io/_socrata.py
224
225
226
227
228
229
230
231
232
233
234
235
236
def load_service_requests_from_socrata(
    socrata_config: SocrataConfig,
    *,
    filters: ServiceRequestFilter,
    request_open: Callable[..., Any],
) -> list[ServiceRequestRecord]:
    """Fetch records from the live Socrata API and return those passing ``filters``.

    All pages are collected into a list before ``filters`` is applied.
    """
    fetched = [
        *iter_service_requests_from_socrata(
            socrata_config, filters=filters, request_open=request_open
        )
    ]
    return _apply_filters(fetched, filters)

Analysis

nyc311.analysis

Public analysis helpers for nyc311 complaint workflows.

DEFAULT_TOPIC_RULES module-attribute

# Built-in topic rules keyed by complaint type. Each rule pairs a topic label
# with the keywords/phrases listed for that topic.
DEFAULT_TOPIC_RULES: Final[dict[str, TopicRuleSet]] = {
    "Noise - Residential": (
        (
            "party_music",
            (
                "party",
                "music",
                "speakers",
                "stereo",
                "bass",
                "television",
            ),
        ),
        (
            "construction",
            ("construction", "drilling", "jackhammer"),
        ),
        ("pet_noise", ("dog", "barking", "pet")),
        (
            "banging",
            (
                "banging",
                "thumping",
                "shaking",
                "arguing",
                "hammering",
            ),
        ),
    ),
    "Illegal Parking": (
        ("hydrant_blocking", ("hydrant", "fire hydrant")),
        ("crosswalk_blocking", ("crosswalk",)),
        ("bus_stop_blocking", ("bus stop",)),
        (
            "double_parked",
            (
                "double parked",
                "double parking",
                # Previously a second, redundant "double parked"; the
                # hyphenated spelling mirrors the "cave in"/"cave-in" pair.
                "double-parked",
            ),
        ),
    ),
    "Blocked Driveway": (
        (
            "commercial_driveway",
            ("commercial van", "delivery truck", "truck"),
        ),
        ("overnight_blocking", ("overnight", "all night")),
        (
            "residential_driveway",
            ("residential driveway", "driveway", "garage"),
        ),
    ),
    "Rodent": (
        (
            "extermination_request",
            (
                "exterminator",
                "extermination",
                "infestation",
            ),
        ),
        ("rats_seen", ("rats", "rat", "trash bags")),
        ("mouse_condition", ("mouse", "mice", "droppings")),
    ),
    "HEAT/HOT WATER": (
        (
            "no_heat",
            (
                "no heat",
                "without heat",
                "radiator cold",
                "heat not working",
            ),
        ),
        (
            "no_hot_water",
            (
                "no hot water",
                "without hot water",
                "hot water not working",
            ),
        ),
        (
            "intermittent_heat",
            (
                "intermittent heat",
                "heat comes and goes",
                "heat inconsistent",
            ),
        ),
    ),
    "Street Condition": (
        ("pothole", ("pothole", "potholes")),
        (
            "cave_in",
            (
                "cave in",
                "cave-in",
                "sinkhole",
                "collapsed roadway",
            ),
        ),
        (
            "rough_road",
            (
                "uneven",
                "rough road",
                "broken asphalt",
                "road surface",
            ),
        ),
    ),
    "Noise - Street/Sidewalk": (
        (
            "construction",
            ("construction", "drilling", "jackhammer"),
        ),
        (
            "loud_vehicle",
            (
                "car alarm",
                "engine idling",
                "horn",
                "vehicle",
                "muffler",
            ),
        ),
        (
            "bar_noise",
            (
                "bar",
                "club",
                "restaurant",
                "patrons",
                "crowd",
            ),
        ),
    ),
    "UNSANITARY CONDITION": (
        (
            "garbage",
            ("garbage", "trash", "refuse", "debris"),
        ),
        (
            "sewage",
            ("sewage", "feces", "human waste", "overflow"),
        ),
        (
            "pest_waste",
            (
                "rodent",
                "rat",
                "mouse",
                "droppings",
                "animal waste",
            ),
        ),
    ),
    "Abandoned Vehicle": (
        (
            "derelict_vehicle",
            (
                "abandoned",
                "derelict",
                "stripped",
                "wrecked",
            ),
        ),
        (
            "unlicensed_vehicle",
            (
                "no plate",
                "no registration",
                "expired registration",
            ),
        ),
    ),
}

TopicRule module-attribute

# One topic rule: a string paired with a tuple of strings. In
# DEFAULT_TOPIC_RULES the first element is the topic label and the second
# lists the keywords/phrases for that topic.
TopicRule = tuple[str, tuple[str, ...]]

TopicRuleSet module-attribute

# The rules for one complaint type: a tuple of TopicRule entries.
TopicRuleSet = tuple[TopicRule, ...]

aggregate_by_geography

aggregate_by_geography(
    topic_assignments: list[TopicAssignment], geography: str
) -> list[GeographyTopicSummary]

Aggregate deterministic topic assignments into supported geographies.

Source code in src/nyc311/analysis/_aggregation.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def aggregate_by_geography(
    topic_assignments: list[TopicAssignment],
    geography: str,
) -> list[GeographyTopicSummary]:
    """Count topic assignments per (geography value, complaint type) group.

    Each group yields one summary per topic. Topics within a group are ranked
    by descending count, with ties broken by topic name; rank 1 is flagged as
    the dominant topic. Groups are emitted in sorted key order.

    Returns an empty list when ``topic_assignments`` is empty.
    """
    if not topic_assignments:
        return []

    # (geography value, complaint type) -> topic -> number of assignments.
    per_group: dict[tuple[str, str], dict[str, int]] = defaultdict(
        lambda: defaultdict(int)
    )
    for item in topic_assignments:
        group_key = (
            item.record.geography_value(geography),
            item.record.complaint_type,
        )
        per_group[group_key][item.topic] += 1

    results: list[GeographyTopicSummary] = []
    for group_key in sorted(per_group):
        area, complaint_type = group_key
        topic_tally = per_group[group_key]
        group_total = sum(topic_tally.values())
        ranked = sorted(topic_tally.items(), key=lambda pair: (-pair[1], pair[0]))

        rank = 0
        for topic, count in ranked:
            rank += 1
            results.append(
                GeographyTopicSummary(
                    geography=geography,
                    geography_value=area,
                    complaint_type=complaint_type,
                    topic=topic,
                    complaint_count=count,
                    geography_total_count=group_total,
                    share_of_geography=count / group_total,
                    topic_rank=rank,
                    is_dominant_topic=rank == 1,
                )
            )

    return results

detect_anomalies

detect_anomalies(
    aggregated_data: list[GeographyTopicSummary],
    window: AnalysisWindow,
    *,
    z_threshold: float = 2.0,
) -> list[AnomalyResult]

Score unusually high or low aggregated topic counts via z-scores.

Source code in src/nyc311/analysis/_anomalies.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def detect_anomalies(
    aggregated_data: list[GeographyTopicSummary],
    window: AnalysisWindow,
    *,
    z_threshold: float = 2.0,
) -> list[AnomalyResult]:
    """Flag unusually high or low topic counts using per-group z-scores.

    Summaries are grouped by (geography, complaint type) and the complaint
    counts inside each group are scored together. A result is marked as an
    anomaly when the absolute z-score reaches ``z_threshold``. Results are
    ordered by descending absolute z-score.

    Raises:
        ValueError: If ``z_threshold`` is not positive.
    """
    if z_threshold <= 0:
        raise ValueError("z_threshold must be positive.")
    if not aggregated_data:
        return []

    buckets: dict[tuple[str, str], list[GeographyTopicSummary]] = {}
    for row in aggregated_data:
        buckets.setdefault((row.geography, row.complaint_type), []).append(row)

    scored: list[AnomalyResult] = []
    for bucket in buckets.values():
        bucket.sort(
            key=lambda row: (row.geography_value, row.topic_rank, row.topic)
        )
        counts = [row.complaint_count for row in bucket]
        for row, score in zip(bucket, _compute_z_scores(counts), strict=True):
            scored.append(
                AnomalyResult(
                    geography=row.geography,
                    geography_value=row.geography_value,
                    complaint_type=row.complaint_type,
                    topic=row.topic,
                    complaint_count=row.complaint_count,
                    geography_total_count=row.geography_total_count,
                    share_of_geography=row.share_of_geography,
                    topic_rank=row.topic_rank,
                    z_score=score,
                    is_anomaly=abs(score) >= z_threshold,
                    window_days=window.days,
                    anomaly_threshold=z_threshold,
                )
            )

    # Strongest deviations first; the remaining fields make the order stable.
    scored.sort(
        key=lambda item: (
            -abs(item.z_score),
            item.geography,
            item.complaint_type,
            item.geography_value,
            item.topic_rank,
            item.topic,
        )
    )
    return scored

analyze_topic_coverage

analyze_topic_coverage(
    service_requests: list[ServiceRequestRecord],
    query: TopicQuery,
    *,
    custom_rules: TopicRuleSet | None = None,
    top_unmatched_n: int = 10,
) -> TopicCoverageReport

Report how much a topic configuration matched versus falling into other.

Source code in src/nyc311/analysis/_coverage.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
def analyze_topic_coverage(
    service_requests: list[ServiceRequestRecord],
    query: TopicQuery,
    *,
    custom_rules: TopicRuleSet | None = None,
    top_unmatched_n: int = 10,
) -> TopicCoverageReport:
    """Measure how many assignments got a real topic versus the fallback topic.

    Records of the queried complaint type are run through
    :func:`extract_topics`. The report contains the matched and fallback
    counts, the coverage rate (``0`` when there are no assignments) and the
    ``top_unmatched_n`` most common descriptors among fallback assignments.
    """
    complaint_type = query.complaint_type
    candidates = [
        request
        for request in service_requests
        if request.complaint_type == complaint_type
    ]
    assignments = extract_topics(candidates, query, custom_rules=custom_rules)

    matched = 0
    unmatched: Counter = Counter()
    for assignment in assignments:
        if assignment.topic != _OTHER_TOPIC:
            matched += 1
        else:
            # Empty descriptors are tallied under the placeholder text.
            label = _normalize_value(assignment.record.descriptor) or _UNSPECIFIED_TEXT
            unmatched[label] += 1

    total = len(assignments)
    if total == 0:
        rate = 0
    else:
        rate = matched / total

    return TopicCoverageReport(
        complaint_type=query.complaint_type,
        total_records=total,
        matched_records=matched,
        other_records=total - matched,
        coverage_rate=rate,
        top_unmatched_descriptors=tuple(unmatched.most_common(top_unmatched_n)),
    )

analyze_resolution_gaps

analyze_resolution_gaps(
    service_requests: list[ServiceRequestRecord],
    resolution_data: list[ServiceRequestRecord],
) -> list[ResolutionGapSummary]

Summarize unresolved complaint share by borough and complaint type.

Source code in src/nyc311/analysis/_resolution.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def analyze_resolution_gaps(
    service_requests: list[ServiceRequestRecord],
    resolution_data: list[ServiceRequestRecord],
) -> list[ResolutionGapSummary]:
    """Report resolved and unresolved request shares per borough and complaint type.

    A request counts as resolved when it carries its own resolution text or
    when its id appears among the ``resolution_data`` records that have
    resolution text. Results are ordered by descending unresolved share, then
    by descending total count, borough and complaint type.

    Returns an empty list when ``service_requests`` is empty.
    """
    if not service_requests:
        return []

    externally_resolved = set()
    for entry in resolution_data:
        if entry.resolution_description is not None:
            externally_resolved.add(entry.service_request_id)

    # (borough, complaint type) -> [total requests, resolved requests]
    tallies: dict[tuple[str, str], list[int]] = {}
    for request in service_requests:
        tally = tallies.setdefault((request.borough, request.complaint_type), [0, 0])
        tally[0] += 1
        is_resolved = (
            request.resolution_description is not None
            or request.service_request_id in externally_resolved
        )
        if is_resolved:
            tally[1] += 1

    gaps: list[ResolutionGapSummary] = []
    for group_key in sorted(tallies):
        borough, complaint_type = group_key
        total, resolved = tallies[group_key]
        still_open = total - resolved
        gaps.append(
            ResolutionGapSummary(
                geography="borough",
                geography_value=borough,
                complaint_type=complaint_type,
                total_request_count=total,
                resolved_request_count=resolved,
                unresolved_request_count=still_open,
                unresolved_share=still_open / total,
                resolution_rate=resolved / total,
            )
        )

    gaps.sort(
        key=lambda gap: (
            -gap.unresolved_share,
            -gap.total_request_count,
            gap.geography_value,
            gap.complaint_type,
        )
    )
    return gaps

extract_topics

extract_topics(
    service_requests: list[ServiceRequestRecord],
    query: TopicQuery,
    *,
    custom_rules: TopicRuleSet | None = None,
) -> list[TopicAssignment]

Extract deterministic first-pass topics for one complaint type.

Source code in src/nyc311/analysis/_topics.py
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
def extract_topics(
    service_requests: list[ServiceRequestRecord],
    query: TopicQuery,
    *,
    custom_rules: TopicRuleSet | None = None,
) -> list[TopicAssignment]:
    """Assign first-pass topics to the records of one complaint type.

    Only records whose complaint type equals ``query.complaint_type`` are
    considered. Rule-based extraction is used when rules are available for
    that complaint type, otherwise the fallback extraction runs. The result is
    limited according to ``query.top_n``.

    Returns an empty list when no record matches the complaint type.
    """
    wanted_type = query.complaint_type
    selected: list[ServiceRequestRecord] = []
    for request in service_requests:
        if request.complaint_type == wanted_type:
            selected.append(request)
    if not selected:
        return []

    active_rules = _select_topic_rules(wanted_type, custom_rules)
    assignments = (
        _extract_fallback_topics(selected)
        if active_rules is None
        else _extract_rule_based_topics(selected, active_rules)
    )
    return _limit_assignments(assignments, top_n=query.top_n)

register_topic_rules

register_topic_rules(
    complaint_type: str, rules: TopicRuleSet
) -> None

Register or replace topic rules for one complaint type.

Source code in src/nyc311/analysis/_topics.py
116
117
118
119
120
121
def register_topic_rules(complaint_type: str, rules: TopicRuleSet) -> None:
    """Store ``rules`` for ``complaint_type``, replacing any earlier entry.

    Both the complaint type and the rules are normalized before being stored.

    Raises:
        ValueError: If the normalized complaint type is empty.
    """
    registry_key = _normalize_value(complaint_type)
    if registry_key:
        _REGISTERED_TOPIC_RULES[registry_key] = _normalize_topic_rules(rules)
        return
    raise ValueError("complaint_type must not be empty.")

Geographies

nyc311.geographies

Public access to packaged NYC geography layers and boundary helpers.

boundaries_to_dataframe

boundaries_to_dataframe(
    boundaries: BoundaryCollection,
) -> pd.DataFrame

Convert a typed boundary collection into a DataFrame.

Source code in src/nyc311/geographies/_conversions.py
25
26
27
28
29
30
31
32
33
34
def boundaries_to_dataframe(boundaries: BoundaryCollection) -> pd.DataFrame:
    """Return the boundary collection as a pandas DataFrame.

    Raises:
        ImportError: If the toolkit conversion raises ``ImportError``; the
            error is re-raised with install instructions for pandas.
    """
    try:
        frame = toolkit_boundaries_to_dataframe(boundaries)
    except ImportError as exc:  # pragma: no cover - exercised in optional tests
        install_hint = (
            "pandas is required for nyc311 geography dataframe helpers. "
            "Install it with `pip install nyc311[dataframes]`, "
            "`pip install nyc311[science]`, or `pip install pandas`."
        )
        raise ImportError(install_hint) from exc
    return frame

boundaries_to_geojson

boundaries_to_geojson(
    boundaries: BoundaryCollection,
) -> dict[str, object]

Convert a typed boundary collection into a GeoJSON FeatureCollection.

Source code in src/nyc311/geographies/_conversions.py
20
21
22
def boundaries_to_geojson(boundaries: BoundaryCollection) -> dict[str, object]:
    """Return the boundary collection as a GeoJSON FeatureCollection mapping.

    Delegates to the toolkit conversion helper.
    """
    feature_collection = toolkit_boundaries_to_geojson(boundaries)
    return feature_collection

list_boundary_layers

list_boundary_layers() -> tuple[str, ...]

List the packaged NYC boundary layers shipped with nyc311.

Source code in src/nyc311/geographies/_loaders.py
69
70
71
def list_boundary_layers() -> tuple[str, ...]:
    """Return the names of the NYC boundary layers packaged with nyc311.

    Delegates to the toolkit layer listing.
    """
    layer_names = toolkit_list_boundary_layers()
    return layer_names

list_boundary_values

list_boundary_values(layer: str) -> tuple[str, ...]

List the canonical values available for one packaged boundary layer.

Source code in src/nyc311/geographies/_loaders.py
74
75
76
def list_boundary_values(layer: str) -> tuple[str, ...]:
    """Return the canonical values available in one packaged boundary layer.

    Delegates to the toolkit value listing for ``layer``.
    """
    layer_values = toolkit_list_boundary_values(layer)
    return layer_values

load_boundaries

load_boundaries(source: str | Path) -> BoundaryCollection

Load boundaries from a file path or a packaged NYC boundary layer.

Source code in src/nyc311/geographies/_loaders.py
88
89
90
def load_boundaries(source: str | Path) -> BoundaryCollection:
    """Load boundaries from a file path or from a packaged NYC boundary layer.

    ``source`` is handed unchanged to the toolkit loader.
    """
    collection = toolkit_load_boundaries(source)
    return collection

load_nyc_boundaries

load_nyc_boundaries(
    layer: str = "community_district",
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> BoundaryCollection

Load a packaged NYC boundary layer as typed boundary models.

Source code in src/nyc311/geographies/_loaders.py
79
80
81
82
83
84
85
def load_nyc_boundaries(
    layer: str = "community_district",
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> BoundaryCollection:
    """Return a packaged NYC boundary layer as typed boundary models.

    ``layer`` and ``values`` are forwarded to the toolkit loader.
    """
    collection = toolkit_load_nyc_boundaries(layer, values=values)
    return collection

load_nyc_boundaries_geodataframe

load_nyc_boundaries_geodataframe(
    layer: str = "community_district",
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> gpd.GeoDataFrame

Load a packaged NYC boundary layer directly into a GeoDataFrame.

Source code in src/nyc311/geographies/_loaders.py
93
94
95
96
97
98
99
def load_nyc_boundaries_geodataframe(
    layer: str = "community_district",
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> gpd.GeoDataFrame:
    """Return a packaged NYC boundary layer as a GeoDataFrame.

    ``layer`` and ``values`` are forwarded to the toolkit loader.
    """
    boundary_frame = toolkit_load_nyc_boundaries_geodataframe(layer, values=values)
    return boundary_frame

load_nyc_census_tracts

load_nyc_census_tracts(
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> BoundaryCollection

Load the packaged NYC census-tract layer.

Source code in src/nyc311/geographies/_loaders.py
102
103
104
105
106
107
def load_nyc_census_tracts(
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> BoundaryCollection:
    """Return the packaged NYC census-tract layer.

    ``values`` is forwarded to the toolkit loader.
    """
    tracts = toolkit_load_nyc_census_tracts(values=values)
    return tracts

load_nyc_council_districts

load_nyc_council_districts(
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> BoundaryCollection

Load the packaged NYC city-council-district layer.

Source code in src/nyc311/geographies/_loaders.py
118
119
120
121
122
123
def load_nyc_council_districts(
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> BoundaryCollection:
    """Return the packaged NYC city-council-district layer.

    ``values`` is forwarded to the toolkit loader.
    """
    districts = toolkit_load_nyc_council_districts(values=values)
    return districts

load_nyc_neighborhood_tabulation_areas

load_nyc_neighborhood_tabulation_areas(
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> BoundaryCollection

Load the packaged NYC neighborhood-tabulation-area layer.

Source code in src/nyc311/geographies/_loaders.py
110
111
112
113
114
115
def load_nyc_neighborhood_tabulation_areas(
    *,
    values: str | tuple[str, ...] | list[str] | None = None,
) -> BoundaryCollection:
    """Return the packaged NYC neighborhood-tabulation-area layer.

    ``values`` is forwarded to the toolkit loader.
    """
    tabulation_areas = toolkit_load_nyc_neighborhood_tabulation_areas(values=values)
    return tabulation_areas

clip_boundaries_to_bbox

clip_boundaries_to_bbox(
    boundaries: BoundaryCollection,
    *,
    min_longitude: float,
    min_latitude: float,
    max_longitude: float,
    max_latitude: float,
) -> BoundaryCollection

Clip boundary geometries to a longitude/latitude bounding box.

Source code in src/nyc311/geographies/_ops.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
def clip_boundaries_to_bbox(
    boundaries: BoundaryCollection,
    *,
    min_longitude: float,
    min_latitude: float,
    max_longitude: float,
    max_latitude: float,
) -> BoundaryCollection:
    """Clip boundary geometries to a longitude/latitude bounding box.

    The four keyword-only corner coordinates are forwarded, together with
    ``boundaries``, to ``toolkit_clip_boundaries_to_bbox``; whatever that
    call produces is returned unchanged.
    """
    bounding_box = {
        "min_longitude": min_longitude,
        "min_latitude": min_latitude,
        "max_longitude": max_longitude,
        "max_latitude": max_latitude,
    }
    return toolkit_clip_boundaries_to_bbox(boundaries, **bounding_box)

spatially_enrich_records

spatially_enrich_records(
    records: list[ServiceRequestRecord],
    *,
    layer: str = "community_district",
    boundaries: BoundaryCollection | None = None,
) -> gpd.GeoDataFrame

Attach packaged boundary attributes to point-capable service requests.

Source code in src/nyc311/geographies/_ops.py
36
37
38
39
40
41
42
43
44
45
46
47
def spatially_enrich_records(
    records: list[ServiceRequestRecord],
    *,
    layer: str = "community_district",
    boundaries: BoundaryCollection | None = None,
) -> gpd.GeoDataFrame:
    """Attach packaged boundary attributes to point-capable service requests.

    Args:
        records: Service-request records to convert into a GeoDataFrame and
            join against the boundaries.
        layer: Boundary layer name. It is normalized with
            ``normalize_boundary_layer`` and used to load the packaged layer
            when ``boundaries`` is not supplied.
        boundaries: Optional pre-loaded boundary collection. Any collection
            passed here is used as given; the packaged layer is loaded only
            when this is ``None``.

    Returns:
        The GeoDataFrame produced by ``spatial_join_records_to_boundaries``
            for the converted records and boundaries.
    """
    normalized_layer = normalize_boundary_layer(layer)
    # Compare against ``None`` explicitly: a truthiness test would discard a
    # caller-supplied collection that happens to evaluate falsy and silently
    # substitute the full packaged layer.
    boundary_collection = (
        load_nyc_boundaries(normalized_layer) if boundaries is None else boundaries
    )
    boundaries_gdf = _boundary_collection_to_geodataframe(boundary_collection)
    records_gdf = records_to_geodataframe(records)
    return spatial_join_records_to_boundaries(records_gdf, boundaries_gdf)

Samples

nyc311.samples

Packaged sample data helpers for nyc311 examples and tests.

load_sample_boundaries

load_sample_boundaries(
    layer: str = "community_district",
) -> BoundaryCollection

Load the subset of packaged boundaries that overlaps the sample records.

Source code in src/nyc311/samples/_loaders.py
27
28
29
30
31
32
33
34
35
36
37
def load_sample_boundaries(layer: str = "community_district") -> BoundaryCollection:
    """Load the packaged boundaries that overlap the sample records.

    The layer name is normalized with ``normalize_boundary_layer`` and looked
    up in the mapping returned by ``load_sample_boundary_values``; the values
    found there are passed to ``load_nyc_boundaries``.

    Raises:
        ValueError: If the sample mapping has no values for the normalized
            layer.
    """
    layer_key = normalize_boundary_layer(layer)
    sample_values = load_sample_boundary_values().get(layer_key)
    if sample_values is not None:
        return load_nyc_boundaries(layer_key, values=sample_values)
    raise ValueError(
        "No packaged sample boundaries are available for layer "
        f"{layer_key!r}."
    )

load_sample_service_requests

load_sample_service_requests(
    *, filters: ServiceRequestFilter | None = None
) -> list[ServiceRequestRecord]

Load the packaged sample NYC 311 service-request slice.

Source code in src/nyc311/samples/_loaders.py
15
16
17
18
19
20
21
22
23
24
def load_sample_service_requests(
    *,
    filters: ServiceRequestFilter | None = None,
) -> list[ServiceRequestRecord]:
    """Load the packaged sample NYC 311 service-request slice.

    The sample CSV path is obtained from the ``sample_service_request_path``
    context manager and read with ``load_service_requests_from_csv``. A
    default ``ServiceRequestFilter()`` is substituted when ``filters`` is
    omitted.
    """
    effective_filters = filters or ServiceRequestFilter()
    with sample_service_request_path() as csv_path:
        sample_records = load_service_requests_from_csv(
            csv_path,
            filters=effective_filters,
        )
        return sample_records

Export

nyc311.export

Public export helpers for nyc311 outputs.

export_anomalies

export_anomalies(
    data: list[AnomalyResult], target: ExportTarget
) -> Path

Export anomaly detections to a CSV file.

Source code in src/nyc311/export/_csv.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
def export_anomalies(data: list[AnomalyResult], target: ExportTarget) -> Path:
    """Write anomaly detections to a CSV file and return its path.

    Parent directories of ``target.output_path`` are created as needed and the
    file is opened in write mode as UTF-8. Columns follow ``ANOMALY_COLUMNS``;
    the share, z-score and threshold are written with six decimal places and
    ``is_anomaly`` as a lowercase ``true``/``false`` string.

    Raises:
        ValueError: If ``target.format`` is not ``"csv"``.
    """
    if target.format != "csv":
        raise ValueError(
            "export_anomalies() currently supports only CSV output. "
            f"Got format={target.format!r}."
        )

    destination = target.output_path
    destination.parent.mkdir(parents=True, exist_ok=True)

    with destination.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=ANOMALY_COLUMNS)
        writer.writeheader()
        # Rows are produced lazily and written one at a time by ``writerows``.
        writer.writerows(
            {
                "geography": anomaly.geography,
                "geography_value": anomaly.geography_value,
                "complaint_type": anomaly.complaint_type,
                "topic": anomaly.topic,
                "complaint_count": anomaly.complaint_count,
                "geography_total_count": anomaly.geography_total_count,
                "share_of_geography": f"{anomaly.share_of_geography:.6f}",
                "topic_rank": anomaly.topic_rank,
                "z_score": f"{anomaly.z_score:.6f}",
                "is_anomaly": str(anomaly.is_anomaly).lower(),
                "window_days": anomaly.window_days,
                "anomaly_threshold": f"{anomaly.anomaly_threshold:.6f}",
            }
            for anomaly in data
        )

    return destination

export_service_requests_csv

export_service_requests_csv(
    data: list[ServiceRequestRecord], target: ExportTarget
) -> Path

Export loaded service-request records to a reproducible CSV snapshot.

Source code in src/nyc311/export/_csv.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def export_service_requests_csv(
    data: list[ServiceRequestRecord], target: ExportTarget
) -> Path:
    """Write loaded service-request records to a CSV snapshot and return its path.

    Parent directories of ``target.output_path`` are created as needed and the
    file is opened in write mode as UTF-8. Columns follow
    ``SERVICE_REQUEST_EXPORT_COLUMNS``. Dates are written in ISO format, and a
    missing resolution description, closed date, latitude or longitude is
    written as an empty string.

    Raises:
        ValueError: If ``target.format`` is not ``"csv"``.
    """
    if target.format != "csv":
        raise ValueError(
            "export_service_requests_csv() currently supports only CSV output. "
            f"Got format={target.format!r}."
        )

    destination = target.output_path
    destination.parent.mkdir(parents=True, exist_ok=True)

    with destination.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=SERVICE_REQUEST_EXPORT_COLUMNS)
        writer.writeheader()
        # Rows are produced lazily and written one at a time by ``writerows``.
        writer.writerows(
            {
                "unique_key": record.service_request_id,
                "created_date": record.created_date.isoformat(),
                "complaint_type": record.complaint_type,
                "descriptor": record.descriptor,
                "borough": record.borough,
                "community_district": record.community_district,
                "resolution_description": record.resolution_description or "",
                "closed_date": (
                    "" if record.closed_date is None else record.closed_date.isoformat()
                ),
                "latitude": "" if record.latitude is None else record.latitude,
                "longitude": "" if record.longitude is None else record.longitude,
            }
            for record in data
        )

    return destination

export_topic_table

export_topic_table(
    data: list[GeographyTopicSummary], target: ExportTarget
) -> Path

Export geography-topic summaries to a CSV file.

Source code in src/nyc311/export/_csv.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def export_topic_table(data: list[GeographyTopicSummary], target: ExportTarget) -> Path:
    """Write geography-topic summaries to a CSV file and return its path.

    Parent directories of ``target.output_path`` are created as needed and the
    file is opened in write mode as UTF-8. Columns follow
    ``TOPIC_SUMMARY_COLUMNS``; the share is written with six decimal places
    and ``is_dominant_topic`` as a lowercase ``true``/``false`` string.

    Raises:
        ValueError: If ``target.format`` is not ``"csv"``.
    """
    if target.format != "csv":
        raise ValueError(
            "export_topic_table() currently supports only CSV output. "
            f"Got format={target.format!r}."
        )

    destination = target.output_path
    destination.parent.mkdir(parents=True, exist_ok=True)

    with destination.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=TOPIC_SUMMARY_COLUMNS)
        writer.writeheader()
        # Rows are produced lazily and written one at a time by ``writerows``.
        writer.writerows(
            {
                "geography": summary.geography,
                "geography_value": summary.geography_value,
                "complaint_type": summary.complaint_type,
                "topic": summary.topic,
                "complaint_count": summary.complaint_count,
                "geography_total_count": summary.geography_total_count,
                "share_of_geography": f"{summary.share_of_geography:.6f}",
                "topic_rank": summary.topic_rank,
                "is_dominant_topic": str(summary.is_dominant_topic).lower(),
            }
            for summary in data
        )

    return destination

export_geojson

export_geojson(
    data: BoundaryGeoJSONExport, target: ExportTarget
) -> Path

Export supported boundary-backed complaint outputs to GeoJSON.

Source code in src/nyc311/export/_geojson.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def export_geojson(data: BoundaryGeoJSONExport, target: ExportTarget) -> Path:
    """Write boundary features, annotated with dominant topics, as GeoJSON.

    Every feature in ``data.boundaries.features`` becomes one GeoJSON feature
    whose properties start with ``geography`` and ``geography_value`` and are
    then overlaid with the boundary's own ``properties``. When a summary
    flagged ``is_dominant_topic`` exists for the same ``geography_value``, its
    complaint type, topic, counts and share (rounded to six decimals) are
    added. If several dominant summaries share a ``geography_value``, the
    last one in ``data.summaries`` is used.

    The file is written as UTF-8 with two-space indentation and sorted keys;
    parent directories are created as needed.

    Raises:
        ValueError: If ``target.format`` is not ``"geojson"``.
    """
    if target.format != "geojson":
        raise ValueError(
            "export_geojson() currently supports only GeoJSON output. "
            f"Got format={target.format!r}."
        )

    destination = target.output_path
    destination.parent.mkdir(parents=True, exist_ok=True)

    dominant_by_geography = {}
    for candidate in data.summaries:
        if candidate.is_dominant_topic:
            dominant_by_geography[candidate.geography_value] = candidate

    features: list[dict[str, object]] = []
    for boundary in data.boundaries.features:
        properties: dict[str, object] = {
            "geography": boundary.geography,
            "geography_value": boundary.geography_value,
            **boundary.properties,
        }
        dominant = dominant_by_geography.get(boundary.geography_value)
        if dominant is not None:
            properties["complaint_type"] = dominant.complaint_type
            properties["dominant_topic"] = dominant.topic
            properties["topic_count"] = dominant.complaint_count
            properties["geography_total_count"] = dominant.geography_total_count
            properties["share_of_geography"] = round(dominant.share_of_geography, 6)
        feature = {
            "type": "Feature",
            "geometry": boundary.geometry,
            "properties": properties,
        }
        features.append(feature)

    payload = json.dumps(
        {"type": "FeatureCollection", "features": features},
        indent=2,
        sort_keys=True,
    )
    destination.write_text(payload, encoding="utf-8")
    return destination

export_report_card

export_report_card(
    data: object, target: ExportTarget
) -> Path

Export a markdown report card from summaries, gaps, and anomalies.

Source code in src/nyc311/export/_report.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
def export_report_card(data: object, target: ExportTarget) -> Path:
    """Write a markdown report card built from summaries, gaps, and anomalies.

    ``data`` is unpacked by ``_coerce_report_card_data`` into topic summaries,
    resolution gaps and anomalies. The report has one ``##`` section per
    distinct ``geography_value`` (sorted), each with three parts: dominant
    topics, resolution overview and anomaly flags. Every part lists at most
    five entries and falls back to a placeholder bullet when it has none.

    Raises:
        ValueError: If ``target.format`` is neither ``"md"`` nor
            ``"markdown"``.
    """
    if target.format not in {"md", "markdown"}:
        raise ValueError(
            "export_report_card() currently supports only markdown output. "
            f"Got format={target.format!r}."
        )

    topic_summaries, resolution_gaps, anomalies = _coerce_report_card_data(data)
    report_path = target.output_path
    report_path.parent.mkdir(parents=True, exist_ok=True)

    geography_values = (
        {summary.geography_value for summary in topic_summaries}
        | {gap.geography_value for gap in resolution_gaps}
        | {anomaly.geography_value for anomaly in anomalies}
    )

    # Group each input by geography so sections can be assembled per value.
    dominant_by_geography: dict[str, list[GeographyTopicSummary]] = {}
    for summary in topic_summaries:
        if not summary.is_dominant_topic:
            continue
        dominant_by_geography.setdefault(summary.geography_value, []).append(summary)
    gaps_by_geography: dict[str, list[ResolutionGapSummary]] = {}
    for gap in resolution_gaps:
        gaps_by_geography.setdefault(gap.geography_value, []).append(gap)
    anomalies_by_geography: dict[str, list[AnomalyResult]] = {}
    for anomaly in anomalies:
        anomalies_by_geography.setdefault(anomaly.geography_value, []).append(anomaly)

    lines = ["# NYC311 Report Card", ""]
    for geography_value in sorted(geography_values):
        lines.append(f"## {geography_value}")
        lines.append("")

        # Dominant topics: largest geography total first, then complaint type.
        lines.append("Dominant topic")
        top_topics = sorted(
            dominant_by_geography.get(geography_value, []),
            key=lambda entry: (-entry.geography_total_count, entry.complaint_type),
        )[:5]
        if top_topics:
            for entry in top_topics:
                lines.append(
                    f"- {entry.complaint_type}: {entry.topic} "
                    f"({entry.complaint_count}/{entry.geography_total_count}, "
                    f"{entry.share_of_geography:.1%})"
                )
        else:
            lines.append("- No topic summaries available.")
        lines.append("")

        # Resolution gaps: busiest complaint types first.
        lines.append("Resolution overview")
        top_gaps = sorted(
            gaps_by_geography.get(geography_value, []),
            key=lambda entry: (-entry.total_request_count, entry.complaint_type),
        )[:5]
        if top_gaps:
            for entry in top_gaps:
                lines.append(
                    f"- {entry.complaint_type}: resolution rate {entry.resolution_rate:.1%}, "
                    f"unresolved {entry.unresolved_request_count}/{entry.total_request_count}"
                )
        else:
            lines.append("- No resolution gap summaries available.")
        lines.append("")

        # Anomalies: sort everything by |z| first, then keep only flagged rows.
        lines.append("Anomaly flags")
        ranked_anomalies = sorted(
            anomalies_by_geography.get(geography_value, []),
            key=lambda entry: (-abs(entry.z_score), entry.topic_rank, entry.topic),
        )
        flagged = [entry for entry in ranked_anomalies if entry.is_anomaly][:5]
        if flagged:
            for entry in flagged:
                lines.append(
                    f"- {entry.complaint_type} / {entry.topic}: "
                    f"count={entry.complaint_count}, z={entry.z_score:.2f}"
                )
        else:
            lines.append("- No anomaly flags above the configured threshold.")
        lines.append("")

    # Drop trailing blank lines and end the file with exactly one newline.
    report_path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8")
    return report_path

Pipeline

nyc311.pipeline

High-level workflow helpers for live fetching and topic-analysis pipelines.

fetch_service_requests

fetch_service_requests(
    *,
    filters: ServiceRequestFilter | None = None,
    socrata_config: SocrataConfig | None = None,
    output: str | Path | None = None,
    cache_dir: Path | str | None = None,
    refresh: bool = False,
    max_cached_records: int | None = None,
) -> list[ServiceRequestRecord]

Fetch a live Socrata slice into memory and optionally stage it as CSV.

This is the intended SDK helper for notebook and workflow users who want to fetch once, inspect records in memory, and only export a local snapshot when they decide the filtered slice is worth keeping.

When cache_dir is set, responses are streamed to a CSV cache first (see nyc311.io.cached_fetch) and then loaded into memory, so avoid huge slices unless you run chunked analysis on the cache file.

Source code in src/nyc311/pipeline.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def fetch_service_requests(
    *,
    filters: ServiceRequestFilter | None = None,
    socrata_config: SocrataConfig | None = None,
    output: str | Path | None = None,
    cache_dir: Path | str | None = None,
    refresh: bool = False,
    max_cached_records: int | None = None,
) -> list[ServiceRequestRecord]:
    """Fetch a live Socrata slice into memory and optionally stage it as CSV.

    Intended for notebook and workflow users who want to fetch once, inspect
    the records in memory, and export a local snapshot only once the filtered
    slice proves worth keeping.

    When ``cache_dir`` is set, responses are streamed to a CSV cache first
    (see ``nyc311.io.cached_fetch``) and then loaded into memory, so avoid
    huge slices unless you run chunked analysis on the cache file.

    A default ``SocrataConfig()`` is used when ``socrata_config`` is omitted.
    When ``output`` is given, the fetched records are also written there via
    ``export_service_requests_csv``. The records are returned either way.
    """
    config = socrata_config or SocrataConfig()
    fetched_records = load_service_requests(
        config,
        filters=filters,
        cache_dir=cache_dir,
        refresh=refresh,
        max_cached_records=max_cached_records,
    )
    if output is None:
        return fetched_records

    snapshot_target = ExportTarget(format="csv", output_path=Path(output))
    export_service_requests_csv(fetched_records, snapshot_target)
    return fetched_records

run_topic_pipeline

run_topic_pipeline(
    source: str | Path | SocrataConfig,
    complaint_type: str,
    *,
    geography: str = "community_district",
    filters: ServiceRequestFilter | None = None,
    top_n: int = 20,
    output: str | Path | None = None,
    output_format: str = "csv",
    boundaries: str | Path | None = None,
) -> list[GeographyTopicSummary]

Run the implemented load-extract-aggregate-export topic workflow.

When output is provided, this helper also writes either a CSV or GeoJSON artifact using the same behavior exposed by the current CLI. The aggregated summaries are always returned to support notebook and workflow use cases.

Source code in src/nyc311/pipeline.py
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
def run_topic_pipeline(
    source: str | Path | SocrataConfig,
    complaint_type: str,
    *,
    geography: str = "community_district",
    filters: ServiceRequestFilter | None = None,
    top_n: int = 20,
    output: str | Path | None = None,
    output_format: str = "csv",
    boundaries: str | Path | None = None,
) -> list[GeographyTopicSummary]:
    """Run the implemented load-extract-aggregate-export topic workflow.

    When ``output`` is provided, this helper also writes either a CSV or GeoJSON
    artifact using the same behavior exposed by the current CLI. The aggregated
    summaries are always returned to support notebook and workflow use cases.

    Args:
        source: Passed as-is to ``load_service_requests``.
        complaint_type: The single complaint type to load and extract topics
            for.
        geography: Geography passed to ``aggregate_by_geography``.
        filters: Optional base filter. Only its ``start_date``, ``end_date``
            and ``geography`` are carried over; the complaint types are always
            replaced by ``(complaint_type,)``.
        top_n: Forwarded to ``TopicQuery``.
        output: Optional destination path for the exported artifact.
        output_format: ``"csv"`` or ``"geojson"``; only consulted when
            ``output`` is given.
        boundaries: Boundary source passed to ``load_boundaries``; required
            for GeoJSON output.

    Returns:
        The summaries produced by ``aggregate_by_geography``.

    Raises:
        ValueError: If ``output`` is given and the export format is neither
            csv nor geojson, or if it is geojson and ``boundaries`` is
            ``None``. Both checks run before any records are loaded.
    """
    # Validate the export request up front so an unusable format or missing
    # boundaries fails immediately instead of after the load/extract/aggregate
    # work has already been done.
    target = None
    if output is not None:
        target = ExportTarget(format=output_format, output_path=Path(output))
        if target.format not in ("csv", "geojson"):
            raise ValueError(
                "run_topic_pipeline() currently supports only csv and geojson output. "
                f"Got format={target.format!r}."
            )
        if target.format == "geojson" and boundaries is None:
            raise ValueError(
                "run_topic_pipeline() requires boundaries when format='geojson'."
            )

    service_request_filter = ServiceRequestFilter(
        start_date=filters.start_date if filters is not None else None,
        end_date=filters.end_date if filters is not None else None,
        geography=filters.geography if filters is not None else None,
        complaint_types=(complaint_type,),
    )
    records = load_service_requests(source, filters=service_request_filter)
    assignments = extract_topics(
        records,
        TopicQuery(complaint_type=complaint_type, top_n=top_n),
    )
    summaries = aggregate_by_geography(assignments, geography=geography)

    if target is None:
        return summaries

    if target.format == "csv":
        export_topic_table(summaries, target)
        return summaries

    # Only geojson can reach this point; boundaries were checked above.
    boundary_collection = load_boundaries(boundaries)
    export_geojson(
        BoundaryGeoJSONExport(
            boundaries=boundary_collection, summaries=tuple(summaries)
        ),
        target,
    )
    return summaries

bulk_fetch

bulk_fetch(
    *,
    complaint_types: tuple[str, ...] = (),
    start_date: date | str | None = None,
    end_date: date | str | None = None,
    cache_dir: Path | str = Path("data/cache"),
    boroughs: tuple[str, ...] | None = None,
    app_token: str | None = None,
    page_size: int = 5000,
    on_progress: Callable[[str, int, int], None]
    | None = None,
) -> list[Path]

Fetch full-city 311 data split by borough for manageable file sizes.

Downloads are split per-borough so that each CSV stays under a few hundred megabytes. Files are written to cache_dir with deterministic names; subsequent calls skip any borough whose file already exists. Each completed CSV is paired with a .meta.json sidecar containing the row count, SHA-256 checksum, fetch timestamp, and the filter parameters used.

The Socrata $select fragment requests the schema: unique_key, created_date, closed_date, complaint_type, descriptor, borough, community_board, resolution_description, latitude, longitude. closed_date (added in v1.0.1 per random-walks/nyc311#20) is nullable — unresolved complaints serialize it as an empty column — which lets downstream resolution-time / SLA analyses compute closed_date - created_date directly without a second round-trip.

Parameters:

Name Type Description Default
complaint_types tuple[str, ...]

Optional whitelist of complaint types. When empty, every complaint type is included.

()
start_date date | str | None

Inclusive lower bound on created_date. Accepts a datetime.date or an ISO-8601 string.

None
end_date date | str | None

Inclusive upper bound on created_date. Accepts a datetime.date or an ISO-8601 string.

None
cache_dir Path | str

Directory to write per-borough CSV files into. The directory is created on demand.

Path('data/cache')
boroughs tuple[str, ...] | None

Boroughs to include. Defaults to all five.

None
app_token str | None

Socrata app token for higher rate limits.

None
page_size int

Rows per Socrata HTTP request.

5000
on_progress Callable[[str, int, int], None] | None

Optional callback invoked after each HTTP page as on_progress(borough, page_index, page_row_count).

None

Returns:

Type Description
list[Path]

Paths to the completed per-borough CSV files in the order the boroughs were processed.

Source code in src/nyc311/pipeline.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
def bulk_fetch(
    *,
    complaint_types: tuple[str, ...] = (),
    start_date: date | str | None = None,
    end_date: date | str | None = None,
    cache_dir: Path | str = Path("data/cache"),
    boroughs: tuple[str, ...] | None = None,
    app_token: str | None = None,
    page_size: int = 5_000,
    on_progress: Callable[[str, int, int], None] | None = None,
) -> list[Path]:
    """Fetch full-city 311 data split by borough for manageable file sizes.

    Downloads are split per-borough so that each CSV stays under a few
    hundred megabytes. Files are written to ``cache_dir`` with
    deterministic names; subsequent calls skip any borough whose file
    already exists. Each completed CSV is paired with a ``.meta.json``
    sidecar containing the row count, SHA-256 checksum, fetch
    timestamp, and the filter parameters used.

    The Socrata ``$select`` fragment requests the schema:
    ``unique_key, created_date, closed_date, complaint_type,
    descriptor, borough, community_board, resolution_description,
    latitude, longitude``. ``closed_date`` (added in v1.0.1 per
    random-walks/nyc311#20) is nullable — unresolved complaints
    serialize it as an empty column — which lets downstream
    resolution-time / SLA analyses compute
    ``closed_date - created_date`` directly without a second
    round-trip.

    Args:
        complaint_types: Optional whitelist of complaint types. When
            empty, every complaint type is included.
        start_date: Inclusive lower bound on ``created_date``. Accepts a
            ``datetime.date`` or an ISO-8601 string.
        end_date: Inclusive upper bound on ``created_date``. Accepts a
            ``datetime.date`` or an ISO-8601 string.
        cache_dir: Directory to write per-borough CSV files into. The
            directory is created on demand.
        boroughs: Boroughs to include. Defaults to all five.
        app_token: Socrata app token for higher rate limits.
        page_size: Rows per Socrata HTTP request.
        on_progress: Optional callback invoked after each HTTP page as
            ``on_progress(borough, page_index, page_row_count)``.

    Returns:
        Paths to the completed per-borough CSV files in the order the
            boroughs were processed.
    """

    def _as_date(bound: date | str | None) -> date | None:
        # ISO strings are parsed; dates and ``None`` pass through untouched.
        return date.fromisoformat(bound) if isinstance(bound, str) else bound

    def _page_callback_for(borough: str) -> Callable[[int, int], None]:
        # Bind the borough per callback so each one reports under its own name.
        def _on_page(page_idx: int, row_count: int) -> None:
            if on_progress is not None:
                on_progress(borough, page_idx, row_count)

        return _on_page

    target_boroughs = boroughs or SUPPORTED_BOROUGHS
    cache_path = Path(cache_dir)
    parsed_start = _as_date(start_date)
    parsed_end = _as_date(end_date)
    config = large_socrata_config(page_size=page_size, app_token=app_token)

    completed_paths: list[Path] = []
    for borough_name in target_boroughs:
        borough_filter = ServiceRequestFilter(
            start_date=parsed_start,
            end_date=parsed_end,
            geography=GeographyFilter(geography="borough", value=borough_name),
            complaint_types=complaint_types,
        )
        completed_paths.append(
            cached_fetch(
                config,
                borough_filter,
                cache_dir=cache_path,
                on_page=_page_callback_for(borough_name),
            )
        )

    return completed_paths

DataFrames

nyc311.dataframes

Optional pandas conversion helpers for notebook and data-science workflows.

anomalies_to_dataframe

anomalies_to_dataframe(
    anomalies: list[AnomalyResult],
) -> Any

Convert anomaly results into a DataFrame.

Source code in src/nyc311/dataframes/_analysis.py
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
def anomalies_to_dataframe(anomalies: list[AnomalyResult]) -> Any:
    """Build a DataFrame with one row per anomaly result.

    pandas is obtained through ``require_pandas``; the column layout is taken
    from ``ANOMALY_COLUMNS``.
    """
    pd = require_pandas()
    rows = [
        {
            "geography": result.geography,
            "geography_value": result.geography_value,
            "complaint_type": result.complaint_type,
            "topic": result.topic,
            "complaint_count": result.complaint_count,
            "geography_total_count": result.geography_total_count,
            "share_of_geography": result.share_of_geography,
            "topic_rank": result.topic_rank,
            "z_score": result.z_score,
            "is_anomaly": result.is_anomaly,
            "window_days": result.window_days,
            "anomaly_threshold": result.anomaly_threshold,
        }
        for result in anomalies
    ]
    return pd.DataFrame.from_records(rows, columns=ANOMALY_COLUMNS)

coverage_to_dataframe

coverage_to_dataframe(
    reports: list[TopicCoverageReport],
) -> Any

Convert topic-coverage reports into a DataFrame.

Source code in src/nyc311/dataframes/_analysis.py
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def coverage_to_dataframe(reports: list[TopicCoverageReport]) -> Any:
    """Build a DataFrame with one row per topic-coverage report.

    ``top_unmatched_descriptors`` is copied into a fresh ``list`` for each
    row. pandas is obtained through ``require_pandas``; the column layout is
    taken from ``TOPIC_COVERAGE_COLUMNS``.
    """
    pd = require_pandas()
    rows = [
        {
            "complaint_type": coverage.complaint_type,
            "total_records": coverage.total_records,
            "matched_records": coverage.matched_records,
            "other_records": coverage.other_records,
            "coverage_rate": coverage.coverage_rate,
            "top_unmatched_descriptors": list(coverage.top_unmatched_descriptors),
        }
        for coverage in reports
    ]
    return pd.DataFrame.from_records(rows, columns=TOPIC_COVERAGE_COLUMNS)

gaps_to_dataframe

gaps_to_dataframe(gaps: list[ResolutionGapSummary]) -> Any

Convert resolution-gap summaries into a DataFrame.

Source code in src/nyc311/dataframes/_analysis.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def gaps_to_dataframe(gaps: list[ResolutionGapSummary]) -> Any:
    """Build a DataFrame with one row per resolution-gap summary.

    pandas is obtained through ``require_pandas``; the column layout is taken
    from ``RESOLUTION_GAP_COLUMNS``.
    """
    pd = require_pandas()
    rows = [
        {
            "geography": entry.geography,
            "geography_value": entry.geography_value,
            "complaint_type": entry.complaint_type,
            "total_request_count": entry.total_request_count,
            "resolved_request_count": entry.resolved_request_count,
            "unresolved_request_count": entry.unresolved_request_count,
            "unresolved_share": entry.unresolved_share,
            "resolution_rate": entry.resolution_rate,
        }
        for entry in gaps
    ]
    return pd.DataFrame.from_records(rows, columns=RESOLUTION_GAP_COLUMNS)

summaries_to_dataframe

summaries_to_dataframe(
    summaries: list[GeographyTopicSummary],
) -> Any

Convert geography-topic summaries into a DataFrame.

Source code in src/nyc311/dataframes/_analysis.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def summaries_to_dataframe(summaries: list[GeographyTopicSummary]) -> Any:
    """Build a DataFrame with one row per geography-topic summary.

    pandas is obtained through ``require_pandas``; the column layout is taken
    from ``TOPIC_SUMMARY_COLUMNS``.
    """
    pd = require_pandas()
    rows = [
        {
            "geography": entry.geography,
            "geography_value": entry.geography_value,
            "complaint_type": entry.complaint_type,
            "topic": entry.topic,
            "complaint_count": entry.complaint_count,
            "geography_total_count": entry.geography_total_count,
            "share_of_geography": entry.share_of_geography,
            "topic_rank": entry.topic_rank,
            "is_dominant_topic": entry.is_dominant_topic,
        }
        for entry in summaries
    ]
    return pd.DataFrame.from_records(rows, columns=TOPIC_SUMMARY_COLUMNS)

assignments_to_dataframe

assignments_to_dataframe(
    assignments: list[TopicAssignment],
) -> Any

Convert topic assignments into a DataFrame.

Source code in src/nyc311/dataframes/_records.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
def assignments_to_dataframe(assignments: list[TopicAssignment]) -> Any:
    """Flatten topic assignments into a DataFrame.

    Each row combines fields of the assignment's underlying record with the
    assignment's own ``topic`` and ``normalized_text``. Column order follows
    ``TOPIC_ASSIGNMENT_COLUMNS`` and ``created_date`` is converted with
    ``pd.to_datetime``.
    """
    pd = require_pandas()
    # Fields copied verbatim from ``assignment.record``.
    record_fields = (
        "service_request_id",
        "created_date",
        "complaint_type",
        "descriptor",
        "borough",
        "community_district",
        "resolution_description",
        "latitude",
        "longitude",
    )
    rows = []
    for item in assignments:
        row = {name: getattr(item.record, name) for name in record_fields}
        row["topic"] = item.topic
        row["normalized_text"] = item.normalized_text
        rows.append(row)

    frame = pd.DataFrame.from_records(rows, columns=TOPIC_ASSIGNMENT_COLUMNS)
    if "created_date" in frame:
        frame["created_date"] = pd.to_datetime(frame["created_date"])
    return frame

dataframe_to_records

dataframe_to_records(
    dataframe: Any,
) -> list[ServiceRequestRecord]

Convert a DataFrame back into typed service-request records.

Source code in src/nyc311/dataframes/_records.py
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
def _coerce_to_date(value: Any) -> date:
    """Normalise a timestamp-like, datetime, date, or ISO-string value to a ``date``."""
    # Local import: only the subclass check below needs ``datetime``.
    from datetime import datetime

    if hasattr(value, "to_pydatetime"):
        # pandas Timestamp (and similar) -> python datetime -> date.
        return value.to_pydatetime().date()
    if isinstance(value, datetime):
        # ``datetime`` is a subclass of ``date``; truncate it explicitly so the
        # result matches what the Timestamp branch above produces.
        return value.date()
    if isinstance(value, date):
        return value
    return date.fromisoformat(str(value))


def dataframe_to_records(dataframe: Any) -> list[ServiceRequestRecord]:
    """Convert a DataFrame back into typed service-request records.

    Args:
        dataframe: A pandas DataFrame containing at least the columns named in
            ``SERVICE_REQUEST_REQUIRED_DATAFRAME_COLUMNS``. The optional
            columns ``resolution_description``, ``closed_date``, ``latitude``
            and ``longitude`` are read when present.

    Returns:
        One ``ServiceRequestRecord`` per row, in row order. Missing / NaN
        values in the optional columns become ``None``; an empty-string
        ``resolution_description`` also becomes ``None``. ``created_date`` and
        ``closed_date`` are always plain ``date`` objects.

    Raises:
        ValueError: If any required column is absent from ``dataframe``.
    """
    pd = require_pandas()
    required_columns = set(SERVICE_REQUEST_REQUIRED_DATAFRAME_COLUMNS)
    missing_columns = sorted(required_columns.difference(dataframe.columns))
    if missing_columns:
        missing = ", ".join(missing_columns)
        raise ValueError(
            f"DataFrame is missing required service-request columns: {missing}."
        )

    records: list[ServiceRequestRecord] = []
    for row in dataframe.to_dict(orient="records"):
        created_date = _coerce_to_date(row["created_date"])

        resolution_description = row.get("resolution_description")
        normalized_resolution = (
            None
            if resolution_description in (None, "") or pd.isna(resolution_description)
            else str(resolution_description)
        )

        raw_closed_date = row.get("closed_date")
        # pd.isna handles pd.NaT directly; it must come first because
        # pd.NaT passes isinstance(x, datetime.date).
        closed_date: date | None = (
            None
            if raw_closed_date is None or pd.isna(raw_closed_date)
            else _coerce_to_date(raw_closed_date)
        )

        latitude = row.get("latitude")
        longitude = row.get("longitude")
        records.append(
            ServiceRequestRecord(
                service_request_id=str(row["service_request_id"]),
                created_date=created_date,
                complaint_type=str(row["complaint_type"]),
                descriptor=str(row["descriptor"]),
                borough=str(row["borough"]),
                community_district=str(row["community_district"]),
                resolution_description=normalized_resolution,
                latitude=None if latitude is None or pd.isna(latitude) else latitude,
                longitude=None
                if longitude is None or pd.isna(longitude)
                else longitude,
                closed_date=closed_date,
            )
        )
    return records

records_to_dataframe

records_to_dataframe(
    records: list[ServiceRequestRecord],
) -> Any

Convert service-request records into a notebook-friendly DataFrame.

Source code in src/nyc311/dataframes/_records.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
def records_to_dataframe(records: list[ServiceRequestRecord]) -> Any:
    """Turn service-request records into a DataFrame for notebook use.

    Columns follow ``SERVICE_REQUEST_DATAFRAME_COLUMNS``; ``created_date`` and
    ``closed_date`` are converted with ``pd.to_datetime``.
    """
    pd = require_pandas()
    # Each output key is read from the record attribute of the same name.
    field_names = (
        "service_request_id",
        "created_date",
        "complaint_type",
        "descriptor",
        "borough",
        "community_district",
        "resolution_description",
        "closed_date",
        "latitude",
        "longitude",
    )
    rows = []
    for item in records:
        rows.append({name: getattr(item, name) for name in field_names})

    frame = pd.DataFrame.from_records(rows, columns=SERVICE_REQUEST_DATAFRAME_COLUMNS)
    for date_column in ("created_date", "closed_date"):
        if date_column in frame:
            frame[date_column] = pd.to_datetime(frame[date_column])
    return frame

resample_and_fill

resample_and_fill(
    dataframe: Any,
    freq: str,
    *,
    method: Literal["zero", "ffill", "bfill"] = "zero",
) -> Any

Resample a DatetimeIndex-indexed frame and fill missing bins.

method='zero' fills missing values with 0 (typical for counts).

Source code in src/nyc311/dataframes/_timeseries.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
def resample_and_fill(
    dataframe: Any,
    freq: str,
    *,
    method: Literal["zero", "ffill", "bfill"] = "zero",
) -> Any:
    """Resample a DatetimeIndex-indexed frame and fill missing bins.

    Values falling in the same bin are summed.

    Args:
        dataframe: Frame indexed by a ``pandas.DatetimeIndex``. ``None`` and
            empty frames (or objects without an ``empty`` attribute) are
            returned unchanged.
        freq: Resampling frequency; normalised via ``_normalize_pandas_freq``.
        method: How to fill bins that received no rows. ``'zero'`` fills with
            ``0`` (typical for counts); ``'ffill'`` / ``'bfill'`` propagate
            the previous / next bin's value.

    Returns:
        The resampled frame with empty bins filled as requested.

    Raises:
        TypeError: If the frame's index is not a ``DatetimeIndex``.
        ValueError: If ``method`` is not one of the supported values.
    """
    pd = require_pandas()
    if dataframe is None or getattr(dataframe, "empty", True):
        return dataframe

    if not isinstance(dataframe.index, pd.DatetimeIndex):
        raise TypeError("resample_and_fill() expects a DatetimeIndex on the DataFrame.")

    freq = _normalize_pandas_freq(freq)
    resampler = dataframe.resample(freq)
    if method == "zero":
        return resampler.sum().fillna(0)
    # A plain ``sum()`` reports 0 for bins with no rows, which would leave
    # nothing for ffill/bfill to propagate into. ``min_count=1`` keeps empty
    # bins as NaN so the directional fills can actually act on them.
    if method == "ffill":
        return resampler.sum(min_count=1).ffill()
    if method == "bfill":
        return resampler.sum(min_count=1).bfill()
    raise ValueError(f"Unsupported method: {method!r}.")

to_panel

to_panel(
    records: list[ServiceRequestRecord],
    *,
    freq: str = "D",
    geography: str = "borough",
) -> Any

Return a panel of complaint counts indexed by (geography_value, period).

Columns are complaint types. Use .xs("BROOKLYN", level=0) for one area.

Source code in src/nyc311/dataframes/_timeseries.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def to_panel(
    records: list[ServiceRequestRecord],
    *,
    freq: str = "D",
    geography: str = "borough",
) -> Any:
    """Count complaints per ``(geography_value, period)`` and complaint type.

    The result is indexed by ``(geography_value, created_date)`` with one
    column per complaint type. Slice a single area with
    ``.xs("BROOKLYN", level=0)``. An empty ``records`` list yields an empty
    DataFrame.
    """
    pd = require_pandas()
    if not records:
        return pd.DataFrame()

    freq = _normalize_pandas_freq(freq)
    # Attach each record's value for the requested geography as a helper column.
    frame = records_to_dataframe(records).assign(
        _geography=[item.geography_value(geography) for item in records]
    )

    group_keys = [
        "_geography",
        pd.Grouper(key="created_date", freq=freq),
        "complaint_type",
    ]
    # Innermost group level (complaint_type) is pivoted out into columns.
    panel = frame.groupby(group_keys).size().unstack(fill_value=0)
    panel.index.names = ("geography_value", "created_date")
    return panel.sort_index()

to_timeseries

to_timeseries(
    records: list[ServiceRequestRecord], *, freq: str = "D"
) -> Any

Return complaint counts per period with a `pandas.DatetimeIndex`.

Columns are complaint types (wide format). Suitable for .plot(), .rolling(), and .resample().

Source code in src/nyc311/dataframes/_timeseries.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def to_timeseries(
    records: list[ServiceRequestRecord],
    *,
    freq: str = "D",
) -> Any:
    """Count complaints per period, one column per complaint type.

    The returned wide-format frame is indexed by ``created_date`` bins of
    width ``freq`` and works directly with ``.plot()``, ``.rolling()`` and
    ``.resample()``. An empty ``records`` list yields an empty DataFrame.
    """
    pd = require_pandas()
    if not records:
        return pd.DataFrame()

    freq = _normalize_pandas_freq(freq)
    frame = records_to_dataframe(records)
    period_key = pd.Grouper(key="created_date", freq=freq)
    per_type = frame.groupby([period_key, "complaint_type"]).size()
    # Pivot complaint types into columns; absent combinations count as 0.
    wide = per_type.unstack(fill_value=0).sort_index()
    wide.index.name = "created_date"
    return wide

to_topic_timeseries

to_topic_timeseries(
    assignments: list[TopicAssignment], *, freq: str = "D"
) -> Any

Like `to_timeseries`, but aggregates extracted topic labels.

Source code in src/nyc311/dataframes/_timeseries.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def to_topic_timeseries(
    assignments: list[TopicAssignment],
    *,
    freq: str = "D",
) -> Any:
    """Count topic assignments per period, one column per topic label.

    Mirrors :func:`to_timeseries`, but groups on each assignment's ``topic``
    instead of the record's complaint type. An empty ``assignments`` list
    yields an empty DataFrame.
    """
    pd = require_pandas()
    if not assignments:
        return pd.DataFrame()

    freq = _normalize_pandas_freq(freq)
    created = pd.to_datetime([item.record.created_date for item in assignments])
    topics = [item.topic for item in assignments]
    frame = pd.DataFrame({"created_date": created, "topic": topics})

    period_key = pd.Grouper(key="created_date", freq=freq)
    per_topic = frame.groupby([period_key, "topic"]).size()
    # Pivot topics into columns; absent combinations count as 0.
    wide = per_topic.unstack(fill_value=0).sort_index()
    wide.index.name = "created_date"
    return wide

Spatial

nyc311.spatial

Optional geospatial helpers built on top of the typed nyc311 models.

The nyc311.spatial module is the GeoDataFrame-flavoured sibling of nyc311.geographies — it loads boundary layers and records as geopandas frames, spatially joins records to boundaries, and materialises typed summaries as map-ready GeoDataFrames.

.. note::

For polygon-centroid points (distance-band spatial weights, Moran's I / LISA, nearest-neighbour joins, choropleth label placement), nyc311 deliberately does not ship a centroid helper in this module. Use upstream instead:

.. code-block:: python

   from nyc_geo_toolkit import (
       centroids_from_boundaries,
       load_nyc_boundaries,
   )

   cbs = load_nyc_boundaries("community_district")
   # representative=True keeps the point inside the polygon —
   # matters for non-convex NYC shorelines.
   points = centroids_from_boundaries(cbs, representative=True)

Shipped as a first-class helper in nyc-geo-toolkit v0.4.0 (on PyPI as v0.4.1 since 2026-04-21). Requires the [spatial] extra on nyc-geo-toolkit for the shapely dependency. See also `nyc311.temporal.centroids_from_boundaries`, which returns a shapely-free `dict[str, (lat, lon)]` suitable for direct use with `nyc311.temporal.build_distance_weights`.

load_boundaries_geodataframe

load_boundaries_geodataframe(
    source: str | Path | BoundaryCollection | None = None,
    *,
    layer: str | None = None,
) -> Any

Load supported boundaries from a path, collection, or packaged layer.

.. note::

Need polygon centroids for spatial weights / Moran's I / label placement? Upstream :func:nyc_geo_toolkit.centroids_from_boundaries (v0.4+) converts any polygon BoundaryCollection into a Point BoundaryCollection, preserving geography / vintage / properties. Pair with representative=True for non-convex polygons. See the :mod:nyc311.spatial module docstring for the full recipe.

Source code in src/nyc311/spatial/_boundaries.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
def load_boundaries_geodataframe(
    source: str | Path | BoundaryCollection | None = None,
    *,
    layer: str | None = None,
) -> Any:
    """Load boundaries as a GeoDataFrame from a path, collection, or layer name.

    Exactly one of ``source`` and ``layer`` must be given. ``layer`` names a
    packaged boundary layer. ``source`` may be a ``BoundaryCollection``, a
    ``Path``, or a string; a string that is not an existing path is first
    tried as a packaged layer name and, if that raises ``ValueError``, loaded
    as a boundary file.

    Raises:
        ValueError: If both ``source`` and ``layer`` are supplied.
        TypeError: If neither is supplied.

    .. note::

       Need polygon centroids for spatial weights / Moran's I / label
       placement? Upstream :func:`nyc_geo_toolkit.centroids_from_boundaries`
       (v0.4+) converts any polygon ``BoundaryCollection`` into a Point
       ``BoundaryCollection``, preserving geography / vintage / properties.
       Pair with ``representative=True`` for non-convex polygons. See the
       :mod:`nyc311.spatial` module docstring for the full recipe.
    """
    if layer is not None and source is not None:
        raise ValueError("Pass either source or layer, not both.")
    if layer is not None:
        return _load_nyc_boundaries_geodataframe(layer)
    if source is None:
        raise TypeError("load_boundaries_geodataframe() requires source or layer.")

    if isinstance(source, BoundaryCollection):
        return _boundary_collection_to_geodataframe(source)

    is_file_like = isinstance(source, Path) or Path(source).exists()
    if not is_file_like:
        # Ambiguous string: prefer a packaged layer name, fall back to a file.
        try:
            return _load_nyc_boundaries_geodataframe(str(source))
        except ValueError:
            return _boundary_collection_to_geodataframe(
                load_boundary_collection(source)
            )
    return _boundary_collection_to_geodataframe(load_boundary_collection(source))

spatial_join_records_to_boundaries

spatial_join_records_to_boundaries(
    records_gdf: Any, boundaries_gdf: Any
) -> Any

Join point records to boundary polygons without clobbering record columns.

Source code in src/nyc311/spatial/_joins.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
def spatial_join_records_to_boundaries(records_gdf: Any, boundaries_gdf: Any) -> Any:
    """Left-join point records to the boundary polygons that contain them.

    Boundaries are reprojected to the records' CRS when both frames carry a
    CRS and the two differ. Every boundary column other than ``geometry`` is
    prefixed with ``boundary_`` so it cannot overwrite a record column, and
    the ``index_right`` column added by the join is dropped.
    """
    geopandas, _ = require_geospatial_stack()

    record_crs = getattr(records_gdf, "crs", None)
    boundary_crs = getattr(boundaries_gdf, "crs", None)
    if record_crs and boundary_crs and record_crs != boundary_crs:
        polygons = boundaries_gdf.to_crs(record_crs)
    else:
        polygons = boundaries_gdf

    prefixed_names = {}
    for name in polygons.columns:
        if name == "geometry":
            continue
        prefixed_names[name] = f"boundary_{name}"
    polygons = polygons.rename(columns=prefixed_names)

    result = geopandas.sjoin(records_gdf, polygons, how="left", predicate="within")
    if "index_right" not in result.columns:
        return result
    return result.drop(columns="index_right")

records_to_geodataframe

records_to_geodataframe(
    records: list[ServiceRequestRecord],
) -> Any

Convert point-capable service-request records into a GeoDataFrame.

Source code in src/nyc311/spatial/_points.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
def records_to_geodataframe(records: list[ServiceRequestRecord]) -> Any:
    """Build a point GeoDataFrame (EPSG:4326) from records that have coordinates.

    Records missing either latitude or longitude are skipped. When no record
    has coordinates, an empty GeoDataFrame with the standard service-request
    columns plus ``geometry`` is returned.
    """
    geopandas, _ = require_geospatial_stack()
    located = [
        item
        for item in records
        if not (item.latitude is None or item.longitude is None)
    ]
    if located:
        frame = records_to_dataframe(located).copy()
        # points_from_xy takes x (longitude) first, then y (latitude).
        points = geopandas.points_from_xy(frame["longitude"], frame["latitude"])
        return geopandas.GeoDataFrame(frame, geometry=points, crs="EPSG:4326")

    return geopandas.GeoDataFrame(
        columns=(*SERVICE_REQUEST_DATAFRAME_COLUMNS, "geometry"),
        geometry="geometry",
        crs="EPSG:4326",
    )

summaries_to_geodataframe

summaries_to_geodataframe(
    summaries: list[Any],
    boundaries_gdf: Any = None,
    *,
    layer: str | None = None,
) -> Any

Merge aggregated geography summaries onto boundary geometries.

Source code in src/nyc311/spatial/_summaries.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def summaries_to_geodataframe(
    summaries: list[Any],
    boundaries_gdf: Any = None,
    *,
    layer: str | None = None,
) -> Any:
    """Attach geography summaries to boundary geometries via a left merge.

    When ``boundaries_gdf`` is omitted, boundaries are loaded for ``layer``;
    when ``layer`` is also omitted, the ``geography`` of the first summary is
    used. The merge keys are ``geography`` and ``geography_value``; boundaries
    without a matching summary are kept.

    Raises:
        ValueError: If neither boundaries nor a layer is given and
            ``summaries`` is empty, or if ``boundaries_gdf`` lacks one of the
            merge-key columns.
    """
    geopandas, _ = require_geospatial_stack()
    if boundaries_gdf is None:
        if layer is None and not summaries:
            raise ValueError(
                "summaries_to_geodataframe() requires boundaries_gdf or layer "
                "when summaries is empty."
            )
        resolved_layer = summaries[0].geography if layer is None else layer
        boundaries_gdf = load_boundaries_geodataframe(layer=resolved_layer)

    required_checks = (
        ("geography", "boundaries_gdf must include a geography column."),
        ("geography_value", "boundaries_gdf must include a geography_value column."),
    )
    for column_name, message in required_checks:
        if column_name not in boundaries_gdf.columns:
            raise ValueError(message)

    combined = boundaries_gdf.merge(
        summaries_to_dataframe(summaries),
        on=["geography", "geography_value"],
        how="left",
    )
    return geopandas.GeoDataFrame(combined, geometry="geometry", crs=boundaries_gdf.crs)

Plotting

nyc311.plotting

Optional in-memory plotting helpers for NYC boundary maps.

plot_boundary_choropleth

plot_boundary_choropleth(
    geodataframe: Any,
    *,
    column: str,
    title: str,
    cmap: str = "viridis",
    categorical: bool = False,
    add_basemap: bool = False,
    figsize: tuple[float, float] = (10, 8),
    outline_gdf: Any | None = None,
    legend_title: str | None = None,
    legend_kwds: dict[str, Any] | None = None,
) -> Any

Render a choropleth map and return the matplotlib figure.

Source code in src/nyc311/plotting.py
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
def plot_boundary_choropleth(
    geodataframe: Any,
    *,
    column: str,
    title: str,
    cmap: str = "viridis",
    categorical: bool = False,
    add_basemap: bool = False,
    figsize: tuple[float, float] = (10, 8),
    outline_gdf: Any | None = None,
    legend_title: str | None = None,
    legend_kwds: dict[str, Any] | None = None,
) -> Any:
    """Render a choropleth map and return the matplotlib figure.

    Args:
        geodataframe: Frame to colour; passed through ``_prepare_plot_frame``.
        column: Column whose values drive the fill colour. Rows where it is
            NaN are drawn in a flat grey instead.
        title: Title handed to ``_finish_axes``.
        cmap: Colormap name forwarded to the frame's ``plot`` call.
        categorical: Treat ``column`` as categorical (legend) rather than
            continuous (colorbar).
        add_basemap: Add CartoDB Positron tiles via contextily; filled
            polygons are then drawn at 0.7 alpha instead of 1.0.
        figsize: Figure size passed to ``plt.subplots``.
        outline_gdf: Optional frame whose boundaries are drawn on top.
        legend_title: Title handed to ``_style_legend``.
        legend_kwds: Overrides merged over the default legend/colorbar kwargs.

    Returns:
        The figure that owns the axes drawn on.

    Raises:
        TypeError: If ``_prepare_plot_frame`` yields ``None`` for
            ``geodataframe``.
    """
    plt = _require_matplotlib()
    # NOTE: this switches the matplotlib style for subsequent plotting as well.
    plt.style.use("seaborn-v0_8-whitegrid")
    plot_gdf = _prepare_plot_frame(geodataframe, add_basemap=add_basemap)
    outline_frame = _prepare_plot_frame(outline_gdf, add_basemap=add_basemap)
    if plot_gdf is None:
        raise TypeError("plot_boundary_choropleth() requires a geodataframe.")
    _figure, axes = plt.subplots(figsize=figsize)
    if categorical:
        # Categorical maps get a legend box anchored just outside the axes.
        effective_legend_kwds: dict[str, Any] = {
            "loc": "upper left",
            "bbox_to_anchor": (1.02, 1),
            "frameon": True,
            "borderaxespad": 0.0,
        }
    else:
        # Continuous choropleth uses a matplotlib colorbar (not a legend).
        effective_legend_kwds = {"shrink": 0.72, "label": column}
    if legend_kwds is not None:
        # Caller-supplied keys win over the defaults above.
        effective_legend_kwds.update(legend_kwds)
    # Split rows into those with a value to colour and those without.
    missing_mask = plot_gdf[column].isna()
    missing_frame = plot_gdf[missing_mask]
    data_frame = plot_gdf[~missing_mask]
    if not data_frame.empty:
        data_frame.plot(
            ax=axes,
            column=column,
            legend=True,
            cmap=cmap,
            categorical=categorical,
            edgecolor="#334155",
            linewidth=0.7,
            alpha=0.7 if add_basemap else 1.0,
            legend_kwds=effective_legend_kwds,
        )
    if not missing_frame.empty:
        # Rows without a value are painted in neutral grey.
        missing_frame.plot(
            ax=axes,
            color="#d4d4d8",
            edgecolor="#94a3b8",
            linewidth=0.7,
        )
        legend = axes.get_legend()
        if categorical and legend is not None:
            # Rebuild the existing legend with an extra "No data" swatch:
            # copy its handles/labels, append the grey patch, then replace it.
            matplotlib_patches = import_module("matplotlib.patches")
            handles = list(legend.legend_handles)
            labels = [text.get_text() for text in legend.get_texts()]
            handles.append(
                matplotlib_patches.Patch(
                    facecolor="#d4d4d8",
                    edgecolor="#94a3b8",
                    label="No data",
                )
            )
            labels.append("No data")
            legend.remove()
            axes.legend(handles, labels, **effective_legend_kwds)
    if outline_frame is not None:
        # Outlines are drawn after the fills; slightly thinner over a basemap.
        outline_frame.boundary.plot(
            ax=axes,
            color="#0f172a",
            linewidth=1.15 if not add_basemap else 0.9,
            alpha=0.75,
        )
    axes.set_axis_off()
    if add_basemap:
        contextily = _require_contextily()
        contextily.add_basemap(
            axes,
            source=contextily.providers.CartoDB.Positron,
            attribution_size=6,
        )
    _style_legend(axes, title=legend_title)
    _finish_axes(axes, title=title)
    return axes.figure

plot_boundary_preview

plot_boundary_preview(
    boundaries_gdf: Any,
    *,
    title: str,
    points_gdf: Any | None = None,
    add_basemap: bool = False,
    figsize: tuple[float, float] = (10, 8),
) -> Any

Render boundary outlines and optional points, then return the figure.

Source code in src/nyc311/plotting.py
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
def plot_boundary_preview(
    boundaries_gdf: Any,
    *,
    title: str,
    points_gdf: Any | None = None,
    add_basemap: bool = False,
    figsize: tuple[float, float] = (10, 8),
) -> Any:
    """Draw boundary outlines, optionally overlay points, and return the figure.

    Both frames go through ``_prepare_plot_frame``. Points are drawn only when
    a non-empty point frame is available, and CartoDB Positron tiles are added
    when ``add_basemap`` is true.

    Raises:
        TypeError: If the prepared boundary frame is ``None``.
    """
    plt = _require_matplotlib()
    plt.style.use("seaborn-v0_8-whitegrid")
    outlines = _prepare_plot_frame(boundaries_gdf, add_basemap=add_basemap)
    markers = _prepare_plot_frame(points_gdf, add_basemap=add_basemap)
    if outlines is None:
        raise TypeError("plot_boundary_preview() requires boundaries_gdf.")

    _figure, axes = plt.subplots(figsize=figsize)

    outline_style = {"color": "#1f2937", "linewidth": 1.25}
    outlines.boundary.plot(ax=axes, **outline_style)

    has_markers = not (markers is None or markers.empty)
    if has_markers:
        marker_style = {"color": "#dc2626", "markersize": 18, "alpha": 0.8}
        markers.plot(ax=axes, **marker_style)

    if add_basemap:
        contextily = _require_contextily()
        tiles = contextily.providers.CartoDB.Positron
        contextily.add_basemap(axes, source=tiles, attribution_size=6)

    _finish_axes(axes, title=title)
    return axes.figure

plot_boundary_point_groups

plot_boundary_point_groups(
    boundaries_gdf: Any,
    *,
    title: str,
    matched_points_gdf: Any | None = None,
    unmatched_points_gdf: Any | None = None,
    context_gdf: Any | None = None,
    outline_gdf: Any | None = None,
    matched_label: str = "Matched",
    unmatched_label: str = "Unmatched",
    add_basemap: bool = False,
    figsize: tuple[float, float] = (10, 8),
) -> Any

Render categorized points over highlighted boundaries and optional context.

Source code in src/nyc311/plotting.py
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
def plot_boundary_point_groups(
    boundaries_gdf: Any,
    *,
    title: str,
    matched_points_gdf: Any | None = None,
    unmatched_points_gdf: Any | None = None,
    context_gdf: Any | None = None,
    outline_gdf: Any | None = None,
    matched_label: str = "Matched",
    unmatched_label: str = "Unmatched",
    add_basemap: bool = False,
    figsize: tuple[float, float] = (10, 8),
) -> Any:
    """Plot matched/unmatched points over boundaries with optional context layers.

    Layers are drawn in this order: context fill, boundary outlines, extra
    outlines, matched points (green), unmatched points (red crosses), and
    finally the basemap tiles when ``add_basemap`` is true. Optional layers
    are skipped when missing or empty. A legend is added only if some layer
    registered a label.

    Raises:
        TypeError: If the prepared boundary frame is ``None``.
    """

    def _has_rows(frame: Any) -> bool:
        # A layer is drawable only when it exists and contains rows.
        return frame is not None and not frame.empty

    plt = _require_matplotlib()
    plt.style.use("seaborn-v0_8-whitegrid")
    highlight = _prepare_plot_frame(boundaries_gdf, add_basemap=add_basemap)
    backdrop = _prepare_plot_frame(context_gdf, add_basemap=add_basemap)
    outlines = _prepare_plot_frame(outline_gdf, add_basemap=add_basemap)
    hits = _prepare_plot_frame(matched_points_gdf, add_basemap=add_basemap)
    misses = _prepare_plot_frame(unmatched_points_gdf, add_basemap=add_basemap)
    if highlight is None:
        raise TypeError("plot_boundary_point_groups() requires boundaries_gdf.")

    figure, axes = plt.subplots(figsize=figsize)

    if _has_rows(backdrop):
        backdrop.plot(ax=axes, color="#f1f5f9", edgecolor="#cbd5e1", linewidth=0.5)

    highlight.boundary.plot(ax=axes, color="#334155", linewidth=1.25)

    if _has_rows(outlines):
        outline_width = 0.9 if add_basemap else 1.15
        outlines.boundary.plot(
            ax=axes, color="#0f172a", linewidth=outline_width, alpha=0.75
        )

    if _has_rows(hits):
        hit_style = _point_style(len(hits), matched=True)
        hits.plot(
            ax=axes,
            color="#16a34a",
            markersize=hit_style["markersize"],
            alpha=hit_style["alpha"],
            label=matched_label,
        )

    if _has_rows(misses):
        miss_style = _point_style(len(misses), matched=False)
        misses.plot(
            ax=axes,
            color="#dc2626",
            markersize=miss_style["markersize"],
            marker="x",
            linewidth=miss_style["linewidth"],
            alpha=miss_style["alpha"],
            label=unmatched_label,
        )

    if add_basemap:
        contextily = _require_contextily()
        tiles = contextily.providers.CartoDB.Positron
        contextily.add_basemap(axes, source=tiles, attribution_size=6)

    handles, _labels = axes.get_legend_handles_labels()
    if handles:
        axes.legend(loc="lower left", frameon=True)
        _style_legend(axes)

    _finish_axes(axes, title=title)
    return figure

plot_timeseries

plot_timeseries(
    dataframe: Any,
    *,
    title: str,
    figsize: tuple[float, float] = (12, 5),
    footnote: str | None = None,
) -> Any

Line chart for a `pandas.DataFrame` with a DatetimeIndex or a `created_date` column.

Source code in src/nyc311/plotting.py
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
def plot_timeseries(
    dataframe: Any,
    *,
    title: str,
    figsize: tuple[float, float] = (12, 5),
    footnote: str | None = None,
) -> Any:
    """Draw a line chart of a DataFrame's columns and return the figure.

    A frame indexed by a ``DatetimeIndex`` is plotted as-is. Otherwise, if it
    has a ``created_date`` column, that column becomes the sorted index.
    Failing both, a copy of the frame is plotted unchanged. A non-empty
    ``footnote`` is centred along the bottom edge of the figure.
    """
    plt = _require_matplotlib()
    pd = import_module("pandas")
    plt.style.use("seaborn-v0_8-whitegrid")
    _figure, axes = plt.subplots(figsize=figsize)

    if isinstance(dataframe.index, pd.DatetimeIndex):
        series_frame = dataframe
    elif "created_date" in getattr(dataframe, "columns", ()):
        series_frame = dataframe.set_index("created_date").sort_index()
    else:
        series_frame = dataframe.copy()

    series_frame.plot(ax=axes, legend=True)
    axes.set_title(title, pad=12)
    axes.set_xlabel("")
    axes.grid(True, alpha=0.3)
    axes.figure.patch.set_facecolor("white")

    if not footnote:
        return axes.figure

    # Reserve space below the axes, then centre the note along the bottom.
    canvas = axes.figure
    canvas.subplots_adjust(bottom=0.16)
    note_style = {
        "ha": "center",
        "fontsize": 8,
        "color": "#555",
        "va": "bottom",
        "wrap": True,
    }
    canvas.text(0.5, 0.02, footnote, **note_style)
    return axes.figure

plot_complaint_heatmap

plot_complaint_heatmap(
    dataframe: Any,
    *,
    title: str,
    time_column: str = "created_date",
    figsize: tuple[float, float] = (10, 6),
) -> Any

Hour-of-day x day-of-week density heatmap (expects datetime resolution in time_column).

Source code in src/nyc311/plotting.py
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
def plot_complaint_heatmap(
    dataframe: Any,
    *,
    title: str,
    time_column: str = "created_date",
    figsize: tuple[float, float] = (10, 6),
) -> Any:
    """Draw a weekday-by-hour heatmap of row counts and return the figure.

    ``time_column`` is parsed with ``pd.to_datetime`` and should carry
    time-of-day resolution. Rows are counted on a fixed 7 x 24 grid (Monday
    to Sunday by hour 0 to 23); cells with no rows are 0.

    Raises:
        ValueError: If ``time_column`` is not a column of ``dataframe``.
    """
    plt = _require_matplotlib()
    pd = import_module("pandas")
    np = import_module("numpy")
    plt.style.use("seaborn-v0_8-whitegrid")
    if time_column not in dataframe.columns:
        raise ValueError(f"DataFrame must include column {time_column!r}.")

    stamps = pd.to_datetime(dataframe[time_column])
    buckets = pd.DataFrame({"hour": stamps.dt.hour, "weekday": stamps.dt.dayofweek})
    buckets = buckets.assign(n=1)
    per_cell = buckets.groupby(["weekday", "hour"], observed=False)["n"].sum()
    # Pivot hours into columns, then force the full 7 x 24 layout.
    grid = per_cell.unstack(fill_value=0).reindex(
        index=range(7), columns=range(24), fill_value=0
    )

    weekday_names = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    _figure, axes = plt.subplots(figsize=figsize)
    image = axes.imshow(
        np.asarray(grid), aspect="auto", cmap="YlOrRd", origin="lower"
    )
    axes.set_xticks(range(0, 24, 2))
    axes.set_yticks(range(7))
    axes.set_yticklabels(weekday_names)
    axes.set_xlabel("Hour of day")
    axes.set_ylabel("Weekday")
    axes.set_title(title, pad=12)
    plt.colorbar(image, ax=axes, fraction=0.046, pad=0.04, label="Complaints")
    axes.figure.patch.set_facecolor("white")
    return axes.figure

plot_stacked_area

plot_stacked_area(
    dataframe: Any,
    *,
    title: str,
    top_n: int = 8,
    figsize: tuple[float, float] = (12, 6),
) -> Any

Stacked area chart of the top-N columns (by total) over a DatetimeIndex.

Source code in src/nyc311/plotting.py
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
def plot_stacked_area(
    dataframe: Any,
    *,
    title: str,
    top_n: int = 8,
    figsize: tuple[float, float] = (12, 6),
) -> Any:
    """Plot the ``top_n`` largest columns of a time-indexed frame as stacked areas.

    Args:
        dataframe: Frame indexed by a ``pandas.DatetimeIndex``; each column
            becomes one layer of the stack.
        title: Title placed above the axes.
        top_n: Number of columns to keep, ranked by their column sum. When
            the selection ends up with no columns, every column is plotted.
        figsize: Figure size forwarded to ``plt.subplots``.

    Returns:
        The Matplotlib figure that owns the stacked-area axes.

    Raises:
        TypeError: If ``dataframe.index`` is not a ``DatetimeIndex``.
    """
    plt = _require_matplotlib()
    pd = import_module("pandas")
    plt.style.use("seaborn-v0_8-whitegrid")
    if not isinstance(dataframe.index, pd.DatetimeIndex):
        raise TypeError(
            "plot_stacked_area() expects a DatetimeIndex-indexed DataFrame."
        )

    ranked = dataframe.sum().sort_values(ascending=False)
    selected = dataframe[list(ranked.head(top_n).index)].fillna(0)
    if selected.shape[1] == 0:
        # Nothing was selected: fall back to plotting the whole frame.
        selected = dataframe.fillna(0)

    mdates = import_module("matplotlib.dates")
    _figure, axes = plt.subplots(figsize=figsize)
    x_positions = mdates.date2num(pd.DatetimeIndex(selected.index).to_pydatetime())
    layers = [selected[name].to_numpy() for name in selected.columns]
    axes.stackplot(
        x_positions,
        *layers,
        labels=list(selected.columns),
        alpha=0.85,
    )
    axes.xaxis_date()
    axes.figure.autofmt_xdate()
    # Anchor the legend outside the plot area, to the right of the axes.
    axes.legend(loc="upper left", bbox_to_anchor=(1.02, 1), frameon=True)
    axes.set_title(title, pad=12)
    axes.set_xlabel("")
    axes.grid(True, alpha=0.25)
    axes.figure.patch.set_facecolor("white")
    return axes.figure

plot_bar_counts

plot_bar_counts(
    labels: list[str],
    counts: list[float],
    *,
    title: str,
    horizontal: bool = False,
    figsize: tuple[float, float] = (10, 6),
) -> Any

Simple bar chart for categorical counts.

Source code in src/nyc311/plotting.py
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
def plot_bar_counts(
    labels: list[str],
    counts: list[float],
    *,
    title: str,
    horizontal: bool = False,
    figsize: tuple[float, float] = (10, 6),
) -> Any:
    """Render one bar per label for a set of categorical counts.

    Args:
        labels: Category names, one per bar.
        counts: Bar sizes, positionally matched with ``labels``.
        title: Title placed above the axes.
        horizontal: Draw horizontal bars when true; otherwise draw vertical
            bars with the category tick labels rotated 45 degrees.
        figsize: Figure size forwarded to ``plt.subplots``.

    Returns:
        The Matplotlib figure that owns the bar chart axes.
    """
    plt = _require_matplotlib()
    plt.style.use("seaborn-v0_8-whitegrid")
    _figure, axes = plt.subplots(figsize=figsize)

    bar_style = {"color": "#3b82f6", "edgecolor": "#1e40af", "linewidth": 0.5}
    draw_bars = axes.barh if horizontal else axes.bar
    draw_bars(labels, counts, **bar_style)
    if not horizontal:
        # Vertical bars put the category names on the x axis; tilt them.
        plt.setp(axes.xaxis.get_majorticklabels(), rotation=45, ha="right")

    axes.set_title(title, pad=12)
    axes.grid(True, axis="y", alpha=0.3)
    axes.figure.patch.set_facecolor("white")
    return axes.figure

plot_complaint_scatter

plot_complaint_scatter(
    points_gdf: Any,
    *,
    boundaries_gdf: Any | None = None,
    title: str,
    column: str = "complaint_type",
    add_basemap: bool = False,
    figsize: tuple[float, float] = (12, 10),
    legend_top_n: int | None = None,
) -> Any

Scatter plot of points colored by column over optional boundary outlines.

Source code in src/nyc311/plotting.py
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
def plot_complaint_scatter(
    points_gdf: Any,
    *,
    boundaries_gdf: Any | None = None,
    title: str,
    column: str = "complaint_type",
    add_basemap: bool = False,
    figsize: tuple[float, float] = (12, 10),
    legend_top_n: int | None = None,
) -> Any:
    """Plot points coloured by a categorical column, optionally over outlines.

    Args:
        points_gdf: Point GeoDataFrame to draw; must be non-empty after
            ``_prepare_plot_frame``.
        boundaries_gdf: Optional GeoDataFrame whose ``boundary`` is drawn
            underneath the points.
        title: Title handed to ``_finish_axes``.
        column: Column of the point frame used for categorical colouring.
        add_basemap: When true, add a CartoDB Positron basemap via contextily.
        figsize: Figure size forwarded to ``plt.subplots``.
        legend_top_n: When given, the legend is rebuilt by
            ``_apply_top_n_categorical_point_legend`` with this ``top_n``;
            otherwise the default legend is restyled with ``_style_legend``.

    Returns:
        The Matplotlib figure that owns the map axes.

    Raises:
        TypeError: If the prepared point frame is ``None`` or empty.
    """
    plt = _require_matplotlib()
    plt.style.use("seaborn-v0_8-whitegrid")
    points = _prepare_plot_frame(points_gdf, add_basemap=add_basemap)
    outlines = _prepare_plot_frame(boundaries_gdf, add_basemap=add_basemap)
    if points is None or points.empty:
        raise TypeError(
            "plot_complaint_scatter() requires a non-empty points GeoDataFrame."
        )

    _figure, axes = plt.subplots(figsize=figsize)
    legend_options = {"bbox_to_anchor": (1.02, 1), "loc": "upper left"}

    has_outlines = outlines is not None and not outlines.empty
    if has_outlines:
        outlines.boundary.plot(ax=axes, color="#0f172a", linewidth=0.8, alpha=0.7)

    points.plot(
        ax=axes,
        column=column,
        legend=True,
        markersize=12,
        alpha=0.5,
        categorical=True,
        cmap="tab20",
        legend_kwds=legend_options,
    )

    if add_basemap:
        ctx = _require_contextily()
        ctx.add_basemap(
            axes,
            source=ctx.providers.CartoDB.Positron,
            attribution_size=6,
        )

    if legend_top_n is None:
        _style_legend(axes)
    else:
        _apply_top_n_categorical_point_legend(
            axes,
            point_frame=points,
            column=column,
            top_n=legend_top_n,
            legend_kwds=legend_options,
        )
    _finish_axes(axes, title=title)
    return axes.figure

plot_hero_banner

plot_hero_banner(
    points_gdf: Any,
    *,
    boundaries_gdf: Any | None = None,
    title: str,
    bbox: tuple[float, float, float, float] | None = None,
    column: str = "complaint_type",
    figsize: tuple[float, float] = (16, 5),
    legend_top_n: int | None = None,
) -> Any

Wide horizontal map with OSM basemap, points, and boundaries (Web Mercator).

Source code in src/nyc311/plotting.py
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
def plot_hero_banner(
    points_gdf: Any,
    *,
    boundaries_gdf: Any | None = None,
    title: str,
    bbox: tuple[float, float, float, float] | None = None,
    column: str = "complaint_type",
    figsize: tuple[float, float] = (16, 5),
    legend_top_n: int | None = None,
) -> Any:
    """Render a wide banner map: boundaries, coloured points, and a basemap.

    Args:
        points_gdf: Point GeoDataFrame to draw.
        boundaries_gdf: Optional GeoDataFrame whose ``boundary`` is drawn
            underneath the points.
        title: Title placed above the (hidden) axes.
        bbox: Optional ``(minx, miny, maxx, maxy)`` window. When given, both
            frames are clipped with the ``.cx`` coordinate indexer before
            they are prepared for plotting.
        column: Column of the point frame used for categorical colouring.
        figsize: Figure size forwarded to ``plt.subplots``.
        legend_top_n: When given, the legend is rebuilt by
            ``_apply_top_n_categorical_point_legend`` with this ``top_n``;
            otherwise the default legend is restyled with ``_style_legend``.

    Returns:
        The Matplotlib figure that owns the banner axes.

    Raises:
        TypeError: If the prepared point frame is ``None`` or empty.
    """
    plt = _require_matplotlib()
    plt.style.use("seaborn-v0_8-whitegrid")

    points, outlines = points_gdf, boundaries_gdf
    if bbox is not None:
        west, south, east, north = bbox
        # GeoPandas uses float coordinate bounds; slice() in typeshed is int-only.
        x_window = slice(cast(Any, west), cast(Any, east))
        y_window = slice(cast(Any, south), cast(Any, north))
        points = points_gdf.cx[x_window, y_window]
        if boundaries_gdf is not None:
            outlines = boundaries_gdf.cx[x_window, y_window]

    # The banner always gets a basemap, so both frames are prepared for one.
    points = _prepare_plot_frame(points, add_basemap=True)
    outlines = _prepare_plot_frame(outlines, add_basemap=True)
    if points is None or points.empty:
        raise TypeError("plot_hero_banner() requires a non-empty points GeoDataFrame.")

    _figure, axes = plt.subplots(figsize=figsize)
    legend_options = {"bbox_to_anchor": (1.01, 1), "loc": "upper left", "fontsize": 8}

    has_outlines = outlines is not None and not outlines.empty
    if has_outlines:
        outlines.boundary.plot(ax=axes, color="#0f172a", linewidth=0.9, alpha=0.85)

    points.plot(
        ax=axes,
        column=column,
        legend=True,
        markersize=8,
        alpha=0.65,
        categorical=True,
        cmap="tab20",
        legend_kwds=legend_options,
    )

    ctx = _require_contextily()
    ctx.add_basemap(
        axes,
        source=ctx.providers.CartoDB.Positron,
        attribution_size=5,
    )

    if legend_top_n is None:
        _style_legend(axes)
    else:
        _apply_top_n_categorical_point_legend(
            axes,
            point_frame=points,
            column=column,
            top_n=legend_top_n,
            legend_kwds=legend_options,
        )
    axes.set_axis_off()
    axes.set_title(title, pad=10, fontsize=14, fontweight="600")
    axes.figure.patch.set_facecolor("white")
    return axes.figure

Presets

nyc311.presets

Reusable preset builders for common nyc311 example and workflow inputs.

build_filter

build_filter(
    *,
    start_date: date | str,
    end_date: date | str,
    geography: str = "borough",
    geography_value: str = models.BOROUGH_BROOKLYN,
    complaint_types: tuple[str, ...] = (),
) -> models.ServiceRequestFilter

Build a typed service-request filter from string-friendly inputs.

Source code in src/nyc311/presets.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def build_filter(
    *,
    start_date: date | str,
    end_date: date | str,
    geography: str = "borough",
    geography_value: str = models.BOROUGH_BROOKLYN,
    complaint_types: tuple[str, ...] = (),
) -> models.ServiceRequestFilter:
    """Assemble a ``ServiceRequestFilter`` from loosely typed arguments.

    Args:
        start_date: Start of the date range; passed through ``_coerce_date``.
        end_date: End of the date range; passed through ``_coerce_date``.
        geography: Geography name handed to ``models.GeographyFilter``.
        geography_value: Geography value handed to ``models.GeographyFilter``.
        complaint_types: Complaint types forwarded unchanged to the filter.

    Returns:
        The populated ``models.ServiceRequestFilter``.
    """
    window_start = _coerce_date(start_date)
    window_end = _coerce_date(end_date)
    area = models.GeographyFilter(geography, geography_value)
    return models.ServiceRequestFilter(
        start_date=window_start,
        end_date=window_end,
        geography=area,
        complaint_types=complaint_types,
    )

brooklyn_borough_filter

brooklyn_borough_filter(
    *,
    start_date: date | str,
    end_date: date | str,
    complaint_types: tuple[str, ...] = (),
) -> models.ServiceRequestFilter

Build a borough-level Brooklyn filter.

Source code in src/nyc311/presets.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def brooklyn_borough_filter(
    *,
    start_date: date | str,
    end_date: date | str,
    complaint_types: tuple[str, ...] = (),
) -> models.ServiceRequestFilter:
    """Return a borough filter pinned to ``models.BOROUGH_BROOKLYN``.

    Thin wrapper around ``build_filter``; the date range and complaint types
    are forwarded unchanged.
    """
    borough_scope = {
        "geography": "borough",
        "geography_value": models.BOROUGH_BROOKLYN,
    }
    return build_filter(
        start_date=start_date,
        end_date=end_date,
        complaint_types=complaint_types,
        **borough_scope,
    )

manhattan_borough_filter

manhattan_borough_filter(
    *,
    start_date: date | str,
    end_date: date | str,
    complaint_types: tuple[str, ...] = (),
) -> models.ServiceRequestFilter

Build a borough-level Manhattan filter.

Source code in src/nyc311/presets.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def manhattan_borough_filter(
    *,
    start_date: date | str,
    end_date: date | str,
    complaint_types: tuple[str, ...] = (),
) -> models.ServiceRequestFilter:
    """Return a borough filter pinned to ``models.BOROUGH_MANHATTAN``.

    Thin wrapper around ``build_filter``; the date range and complaint types
    are forwarded unchanged.
    """
    borough_scope = {
        "geography": "borough",
        "geography_value": models.BOROUGH_MANHATTAN,
    }
    return build_filter(
        start_date=start_date,
        end_date=end_date,
        complaint_types=complaint_types,
        **borough_scope,
    )

small_socrata_config

small_socrata_config(
    *,
    page_size: int = 500,
    max_pages: int | None = 1,
    app_token: str | None = None,
) -> models.SocrataConfig

Build a small Socrata config suited to examples and local iteration.

Source code in src/nyc311/presets.py
64
65
66
67
68
69
70
71
72
73
74
75
def small_socrata_config(
    *,
    page_size: int = 500,
    max_pages: int | None = 1,
    app_token: str | None = None,
) -> models.SocrataConfig:
    """Return a lightweight ``SocrataConfig`` for examples and quick local runs.

    Args:
        page_size: Value for ``SocrataConfig.page_size`` (500 by default).
        max_pages: Value for ``SocrataConfig.max_pages`` (a single page by
            default).
        app_token: Optional token stored as ``SocrataConfig.app_token``.

    Returns:
        The configured ``models.SocrataConfig``.
    """
    settings = {
        "app_token": app_token,
        "page_size": page_size,
        "max_pages": max_pages,
    }
    return models.SocrataConfig(**settings)

large_socrata_config

large_socrata_config(
    *,
    page_size: int = 5000,
    max_pages: int | None = None,
    app_token: str | None = None,
    request_timeout_seconds: float = 300.0,
    created_date_sort: Literal["asc", "desc"] = "asc",
) -> models.SocrataConfig

Build a high-throughput Socrata config for bulk downloads (e.g. full history).

Default page_size is 5,000 rows per request so each HTTP round-trip stays smaller than very large pages, with a five-minute read timeout per request. Use created_date_sort='desc' when you want the most recent rows first (e.g. capped smoke samples).

Source code in src/nyc311/presets.py
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def large_socrata_config(
    *,
    page_size: int = 5_000,
    max_pages: int | None = None,
    app_token: str | None = None,
    request_timeout_seconds: float = 300.0,
    created_date_sort: Literal["asc", "desc"] = "asc",
) -> models.SocrataConfig:
    """Return a ``SocrataConfig`` tuned for bulk downloads (e.g. full history).

    The defaults request 5,000 rows per page, so each HTTP round-trip stays
    smaller than it would with very large pages, and allow a five-minute
    timeout per request. Pass ``created_date_sort='desc'`` to receive the
    most recent rows first (e.g. for capped smoke samples).

    Args:
        page_size: Value for ``SocrataConfig.page_size``.
        max_pages: Value for ``SocrataConfig.max_pages``; ``None`` by default.
        app_token: Optional token stored as ``SocrataConfig.app_token``.
        request_timeout_seconds: Value for
            ``SocrataConfig.request_timeout_seconds``.
        created_date_sort: Sort direction stored as
            ``SocrataConfig.created_date_sort``.

    Returns:
        The configured ``models.SocrataConfig``.
    """
    settings = {
        "app_token": app_token,
        "page_size": page_size,
        "max_pages": max_pages,
        "request_timeout_seconds": request_timeout_seconds,
        "created_date_sort": created_date_sort,
    }
    return models.SocrataConfig(**settings)

smoke_socrata_config

smoke_socrata_config(
    *,
    page_size: int = 5000,
    app_token: str | None = None,
    request_timeout_seconds: float = 120.0,
) -> models.SocrataConfig

Recent-first Socrata config used with a per-borough row cap (see about-the-data --preset smoke).

Source code in src/nyc311/presets.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
def smoke_socrata_config(
    *,
    page_size: int = 5_000,
    app_token: str | None = None,
    request_timeout_seconds: float = 120.0,
) -> models.SocrataConfig:
    """Return a newest-first ``SocrataConfig`` for smoke runs.

    The config always sorts ``created_date`` descending and leaves
    ``max_pages`` as ``None``; it is meant to be combined with a per-borough
    row cap (see about-the-data ``--preset smoke``).

    Args:
        page_size: Value for ``SocrataConfig.page_size``.
        app_token: Optional token stored as ``SocrataConfig.app_token``.
        request_timeout_seconds: Value for
            ``SocrataConfig.request_timeout_seconds``.

    Returns:
        The configured ``models.SocrataConfig``.
    """
    settings = {
        "app_token": app_token,
        "page_size": page_size,
        "max_pages": None,
        "request_timeout_seconds": request_timeout_seconds,
        "created_date_sort": "desc",
    }
    return models.SocrataConfig(**settings)

Factors

nyc311.factors

Composable factor pipeline for NYC 311 complaint analysis.

EquityGapFactor

Bases: Factor

Disparity metric: ratio of unit resolution time to citywide median.

Values above 1.0 indicate the unit resolves complaints slower than the citywide median; below 1.0, faster.

Source code in src/nyc311/factors/_advanced.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
class EquityGapFactor(Factor):
    """Disparity metric: unit median complaint age relative to a citywide median.

    A result above 1.0 means the unit's median is higher (slower) than the
    citywide median; below 1.0 means lower (faster).
    """

    name = "equity_gap"
    dtype = "float"

    def __init__(self, citywide_median_days: float) -> None:
        """Store the citywide baseline.

        Args:
            citywide_median_days: Citywide median resolution time in days;
                it is the denominator of the ratio.
        """
        self._citywide_median = citywide_median_days

    def compute(self, context: FactorContext) -> float:
        """Compute the equity ratio for one context.

        Only complaints whose ``resolution_description`` is not ``None`` are
        considered. Each contributes the whole number of days between its
        ``created_date`` and ``context.time_window_end``, floored at zero.

        Returns:
            The median of those day counts divided by the citywide median,
            or ``0.0`` when there are no such complaints or the citywide
            median is not positive.
        """
        resolved_records = []
        for record in context.complaints:
            if record.resolution_description is not None:
                resolved_records.append(record)

        if not resolved_records:
            return 0.0
        if self._citywide_median <= 0:
            return 0.0

        # NOTE(review): the age is measured up to the end of the window, not
        # up to a closing timestamp -- confirm this proxy is intended.
        window_end = context.time_window_end
        ages_in_days = []
        for record in resolved_records:
            elapsed = float((window_end - record.created_date).days)
            ages_in_days.append(max(elapsed, 0.0))

        return median(ages_in_days) / self._citywide_median

name class-attribute instance-attribute

name = 'equity_gap'

dtype class-attribute instance-attribute

dtype = 'float'

compute

compute(context: FactorContext) -> float

Return the resolution-time equity ratio for context.

Returns:

Type Description
float

unit_median / citywide_median, or 0.0 when no resolved complaints exist or the citywide median is non-positive.

Source code in src/nyc311/factors/_advanced.py
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
def compute(self, context: FactorContext) -> float:
    """Compute the equity ratio for one context.

    Only complaints whose ``resolution_description`` is not ``None`` are
    considered. Each contributes the whole number of days between its
    ``created_date`` and ``context.time_window_end``, floored at zero.

    Returns:
        The median of those day counts divided by the citywide median,
        or ``0.0`` when there are no such complaints or the citywide
        median is not positive.
    """
    resolved_records = []
    for record in context.complaints:
        if record.resolution_description is not None:
            resolved_records.append(record)

    if not resolved_records:
        return 0.0
    if self._citywide_median <= 0:
        return 0.0

    # NOTE(review): the age is measured up to the end of the window, not
    # up to a closing timestamp -- confirm this proxy is intended.
    window_end = context.time_window_end
    ages_in_days = []
    for record in resolved_records:
        elapsed = float((window_end - record.created_date).days)
        ages_in_days.append(max(elapsed, 0.0))

    return median(ages_in_days) / self._citywide_median

SpatialLagFactor

Bases: Factor

Spatial lag of complaint counts: weighted sum of neighbors' values (a weighted average when the weights are row-standardized).

Uses a precomputed spatial weights dict and a values dict to compute the weighted sum of neighboring unit values for the focal unit.

Source code in src/nyc311/factors/_advanced.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
class SpatialLagFactor(Factor):
    """Spatial lag: weighted sum of the values of a unit's neighbors.

    Both inputs are precomputed dicts supplied at construction time: the
    spatial weights and the per-unit values being lagged. With
    row-standardized weights the sum is a weighted average.
    """

    name = "spatial_lag"
    dtype = "float"

    def __init__(
        self,
        weights: dict[str, dict[str, float]],
        values: dict[str, float],
    ) -> None:
        """Store the weights and the values to lag.

        Args:
            weights: Nested dict ``{unit_a: {unit_b: weight}}`` of
                spatial weights (typically row-standardized).
            values: Mapping ``{unit_id: numeric_value}`` holding the
                variable to spatially lag.
        """
        self._weights = weights
        self._values = values

    def compute(self, context: FactorContext) -> float:
        """Compute the spatial lag for ``context.geography_value``.

        Neighbors that have no entry in the values dict contribute ``0.0``.

        Returns:
            The weighted sum over the unit's neighbors, or ``0.0`` when the
            weights dict lists no neighbors for the unit.
        """
        neighbor_weights = self._weights.get(context.geography_value, {})
        if neighbor_weights:
            return sum(
                weight * self._values.get(neighbor, 0.0)
                for neighbor, weight in neighbor_weights.items()
            )
        return 0.0

name class-attribute instance-attribute

name = 'spatial_lag'

dtype class-attribute instance-attribute

dtype = 'float'

compute

compute(context: FactorContext) -> float

Return the spatial lag for the context's geographic unit.

Returns:

Type Description
float

The weighted sum of neighboring values. Returns 0.0 when the unit has no neighbors in the weights dict.

Source code in src/nyc311/factors/_advanced.py
37
38
39
40
41
42
43
44
45
46
47
48
def compute(self, context: FactorContext) -> float:
    """Compute the spatial lag for ``context.geography_value``.

    Neighbors that have no entry in the values dict contribute ``0.0``.

    Returns:
        The weighted sum over the unit's neighbors, or ``0.0`` when the
        weights dict lists no neighbors for the unit.
    """
    neighbor_weights = self._weights.get(context.geography_value, {})
    if neighbor_weights:
        return sum(
            weight * self._values.get(neighbor, 0.0)
            for neighbor, weight in neighbor_weights.items()
        )
    return 0.0

Factor

Bases: ABC

Abstract base for a single named computation over a FactorContext.

Source code in src/nyc311/factors/_base.py
44
45
46
47
48
49
50
51
52
class Factor(ABC):
    """Abstract base for a single named computation over a FactorContext.

    Concrete factors provide ``name`` and ``dtype`` and implement
    :meth:`compute`.
    """

    # Identifier of the factor.
    name: str
    # Declared kind of value the factor produces.
    dtype: Literal["float", "str", "bool", "int"]

    @abstractmethod
    def compute(self, context: FactorContext) -> float | str | bool | int:
        """Return the computed value for *context*.

        Args:
            context: The geographic-unit / time-window context to evaluate.

        Returns:
            A single scalar value for the context.
        """

name instance-attribute

name: str

dtype instance-attribute

dtype: Literal['float', 'str', 'bool', 'int']

compute abstractmethod

compute(context: FactorContext) -> float | str | bool | int

Return the computed value for context.

Source code in src/nyc311/factors/_base.py
50
51
52
@abstractmethod
def compute(self, context: FactorContext) -> float | str | bool | int:
    """Return the computed value for *context*.

    Args:
        context: The geographic-unit / time-window context to evaluate.

    Returns:
        A single scalar value for the context.
    """

FactorContext dataclass

Row-level context for factor computation.

Each context represents one geographic unit (community district, NTA, borough) over one time window. Factors compute a single value from this context.

Source code in src/nyc311/factors/_base.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
@dataclass(frozen=True, slots=True)
class FactorContext:
    """Row-level context for factor computation.

    Each context represents one geographic unit (community district, NTA,
    borough) over one time window.  Factors compute a single value from
    this context.

    Attributes:
        geography: Name of the geography the unit belongs to (presumably
            a level such as ``"borough"`` -- confirm against callers).
        geography_value: Identifier of the geographic unit.
        complaints: Service-request records attached to this context.
        time_window_start: First bound of the time window.
        time_window_end: Second bound of the time window.
        total_population: Optional population figure; ``None`` by default.
        extras: Optional free-form mapping of additional inputs; ``None``
            by default.
    """

    geography: str
    geography_value: str
    complaints: tuple[ServiceRequestRecord, ...]
    time_window_start: date
    time_window_end: date
    total_population: int | None = None
    extras: dict[str, Any] | None = None

geography instance-attribute

geography: str

geography_value instance-attribute

geography_value: str

complaints instance-attribute

complaints: tuple[ServiceRequestRecord, ...]

time_window_start instance-attribute

time_window_start: date

time_window_end instance-attribute

time_window_end: date

total_population class-attribute instance-attribute

total_population: int | None = None

extras class-attribute instance-attribute

extras: dict[str, Any] | None = None

Pipeline

Immutable builder that executes factors over contexts.

Pipeline never mutates in place: add() returns a new pipeline with the factor appended.

Source code in src/nyc311/factors/_base.py
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
class Pipeline:
    """Immutable builder that executes factors over contexts.

    ``Pipeline`` never mutates in place: :meth:`add` returns a **new**
    pipeline with the factor appended.
    """

    __slots__ = ("_factors",)

    def __init__(self, factors: tuple[Factor, ...] = ()) -> None:
        """Store the ordered factors this pipeline evaluates."""
        self._factors = factors

    def add(self, factor: Factor) -> Pipeline:
        """Return a new pipeline with ``factor`` appended.

        Args:
            factor: The factor to append. Must define a unique ``name``;
                uniqueness is enforced when :meth:`run` is called.

        Returns:
            A new :class:`Pipeline` whose ``factors`` tuple ends with
            ``factor``. The receiver is left unmodified.
        """
        return Pipeline((*self._factors, factor))

    @property
    def factors(self) -> tuple[Factor, ...]:
        """The ordered factors in this pipeline."""
        return self._factors

    def as_factor_factory_estimate(
        self,
        panel: Any,
        *,
        family: str = "did",
        method: str = "twfe",
        outcome: str | None = None,
        **engine_kwargs: Any,
    ) -> Any:
        """Run a factor-factory engine on ``panel`` as a Pipeline continuation.

        Additive bridge: the pipeline itself is not executed here.
        Instead, the call dispatches into
        ``factor_factory.engines.<family>.estimate``, returning a
        factor-factory ``<Family>Results`` object that downstream code
        can chain off.

        Args:
            panel: A :class:`factor_factory.tidy.Panel`. Typically
                produced by
                :meth:`nyc311.temporal.PanelDataset.to_factor_factory_panel`.
            family: Engine-family module name under
                ``factor_factory.engines``. Defaults to ``"did"``.
            method: Registry key for a specific adapter inside the
                family (e.g. ``"twfe"``, ``"cs"``). Defaults to
                ``"twfe"``.
            outcome: Outcome column on the Panel. When ``None``, the
                engine falls back to ``panel.outcome_col``.
            **engine_kwargs: Additional kwargs forwarded to the engine's
                ``estimate`` dispatcher.

        Returns:
            A factor-factory ``<Family>Results`` object.

        Raises:
            ImportError: If factor-factory is not installed or the
                requested engine family's optional dependencies are
                missing.
        """
        # Imported lazily so the optional dependency is only needed here.
        from nyc311.factors._factor_factory import dispatch_factor_factory_engine

        return dispatch_factor_factory_engine(
            panel,
            family=family,
            method=method,
            outcome=outcome,
            **engine_kwargs,
        )

    def run(self, contexts: Iterable[FactorContext]) -> PipelineResult:
        """Execute all factors across ``contexts`` and return results.

        Iterates over each context once and evaluates every factor against
        it, producing a columnar :class:`PipelineResult` keyed by factor
        name.

        Args:
            contexts: An iterable of :class:`FactorContext` instances. Each
                context corresponds to one geographic-unit / time-window
                row in the final result.

        Returns:
            A :class:`PipelineResult` whose ``columns`` map factor names to
            value tuples and whose ``geography_ids`` tuple aligns with
            those columns positionally.

        Raises:
            ValueError: If two factors share a ``name``. Their values would
                otherwise land in the same column and break the positional
                alignment with ``geography_ids``.
        """
        names = [factor.name for factor in self._factors]
        seen: set[str] = set()
        duplicates: list[str] = []
        for name in names:
            if name in seen and name not in duplicates:
                duplicates.append(name)
            seen.add(name)
        if duplicates:
            raise ValueError(
                f"Pipeline factor names must be unique; duplicated: {duplicates!r}."
            )

        geography_ids: list[str] = []
        columns: dict[str, list[Any]] = {name: [] for name in names}

        for ctx in contexts:
            geography_ids.append(ctx.geography_value)
            for factor in self._factors:
                columns[factor.name].append(factor.compute(ctx))

        return PipelineResult(
            columns={name: tuple(values) for name, values in columns.items()},
            geography_ids=tuple(geography_ids),
        )

factors property

factors: tuple[Factor, ...]

The ordered factors in this pipeline.

add

add(factor: Factor) -> Pipeline

Return a new pipeline with factor appended.

Parameters:

Name Type Description Default
factor Factor

The factor to append. Must define a unique name.

required

Returns:

Type Description
Pipeline

A new Pipeline whose factors tuple ends with factor. The receiver is left unmodified.

Source code in src/nyc311/factors/_base.py
67
68
69
70
71
72
73
74
75
76
77
def add(self, factor: Factor) -> Pipeline:
    """Build a new pipeline that ends with ``factor``.

    Args:
        factor: The factor to append. Must define a unique ``name``.

    Returns:
        A fresh :class:`Pipeline` holding the receiver's factors followed
        by ``factor``. The receiver itself is not modified.
    """
    extended = (*self._factors, factor)
    return Pipeline(extended)

as_factor_factory_estimate

as_factor_factory_estimate(
    panel: Any,
    *,
    family: str = "did",
    method: str = "twfe",
    outcome: str | None = None,
    **engine_kwargs: Any,
) -> Any

Run a factor-factory engine on panel as a Pipeline continuation.

Additive bridge: the pipeline itself is not executed here. Instead, the call dispatches into factor_factory.engines.<family>.estimate, returning a factor-factory <Family>Results object that downstream code can chain off.

Parameters:

Name Type Description Default
panel Any

A factor_factory.tidy.Panel. Typically produced by nyc311.temporal.PanelDataset.to_factor_factory_panel().

required
family str

Engine-family module name under factor_factory.engines. Defaults to "did".

'did'
method str

Registry key for a specific adapter inside the family (e.g. "twfe", "cs"). Defaults to "twfe".

'twfe'
outcome str | None

Outcome column on the Panel. When None, the engine falls back to panel.outcome_col.

None
**engine_kwargs Any

Additional kwargs forwarded to the engine's estimate dispatcher.

{}

Returns:

Type Description
Any

A factor-factory <Family>Results object.

Raises:

Type Description
ImportError

If factor-factory is not installed or the requested engine family's optional dependencies are missing.

Source code in src/nyc311/factors/_base.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
def as_factor_factory_estimate(
    self,
    panel: Any,
    *,
    family: str = "did",
    method: str = "twfe",
    outcome: str | None = None,
    **engine_kwargs: Any,
) -> Any:
    """Hand ``panel`` to a factor-factory engine as a Pipeline continuation.

    This is an additive bridge: none of the pipeline's factors are run.
    The call is forwarded to ``dispatch_factor_factory_engine``, which
    dispatches into ``factor_factory.engines.<family>.estimate`` and
    yields a factor-factory ``<Family>Results`` object that downstream
    code can chain off.

    Args:
        panel: A :class:`factor_factory.tidy.Panel`. Typically
            produced by
            :meth:`nyc311.temporal.PanelDataset.to_factor_factory_panel`.
        family: Engine-family module name under
            ``factor_factory.engines``. Defaults to ``"did"``.
        method: Registry key for a specific adapter inside the
            family (e.g. ``"twfe"``, ``"cs"``). Defaults to
            ``"twfe"``.
        outcome: Outcome column on the Panel. When ``None``, the
            engine falls back to ``panel.outcome_col``.
        **engine_kwargs: Extra keyword arguments passed through to the
            engine's ``estimate`` dispatcher.

    Returns:
        A factor-factory ``<Family>Results`` object.

    Raises:
        ImportError: If factor-factory is not installed or the
            requested engine family's optional dependencies are
            missing.
    """
    # Imported lazily so the optional dependency is only needed here.
    from nyc311.factors._factor_factory import dispatch_factor_factory_engine

    dispatch_options = {
        "family": family,
        "method": method,
        "outcome": outcome,
        **engine_kwargs,
    }
    return dispatch_factor_factory_engine(panel, **dispatch_options)

run

run(contexts: Iterable[FactorContext]) -> PipelineResult

Execute all factors across contexts and return results.

Iterates over each context once and evaluates every factor against it, producing a columnar PipelineResult keyed by factor name.

Parameters:

Name Type Description Default
contexts Iterable[FactorContext]

An iterable of FactorContext instances. Each context corresponds to one geographic-unit / time-window row in the final result.

required

Returns:

Type Description
PipelineResult

A PipelineResult whose columns map factor names to value tuples and whose geography_ids tuple aligns with those columns positionally.

Source code in src/nyc311/factors/_base.py
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def run(self, contexts: Iterable[FactorContext]) -> PipelineResult:
    """Evaluate every registered factor against every context.

    The contexts are materialized up front and then visited in order.
    For each context, the factors in ``self._factors`` are computed in
    their stored order, so the result is assembled row by row.

    Args:
        contexts: Iterable of :class:`FactorContext` objects. Each one
            contributes a single row to the result.

    Returns:
        A :class:`PipelineResult` whose ``columns`` map each factor name
        to a tuple of computed values and whose ``geography_ids`` tuple
        holds each context's ``geography_value`` in the same row order.
    """
    rows = list(contexts)
    ids: list[str] = []
    by_factor: dict[str, list[Any]] = {}
    for factor in self._factors:
        by_factor[factor.name] = []

    # Context-major order: finish one row before moving to the next.
    for context in rows:
        ids.append(context.geography_value)
        for factor in self._factors:
            by_factor[factor.name].append(factor.compute(context))

    frozen_columns = {}
    for name, values in by_factor.items():
        frozen_columns[name] = tuple(values)
    return PipelineResult(columns=frozen_columns, geography_ids=tuple(ids))

PipelineResult dataclass

Columnar result set produced by :meth:Pipeline.run.

Source code in src/nyc311/factors/_base.py
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
@dataclass(frozen=True, slots=True)
class PipelineResult:
    """Columnar result set produced by :meth:`Pipeline.run`."""

    columns: dict[str, tuple[Any, ...]]
    geography_ids: tuple[str, ...]

    def to_records(self) -> tuple[dict[str, Any], ...]:
        """Convert to a tuple of row dictionaries.

        Returns:
            A tuple where each element is a dict containing
            ``geography_id`` plus one key per factor in the pipeline. The
            row order matches :attr:`geography_ids`.
        """
        records: list[dict[str, Any]] = []
        for i, geography_id in enumerate(self.geography_ids):
            row: dict[str, Any] = {"geography_id": geography_id}
            for col_name, values in self.columns.items():
                row[col_name] = values[i]
            records.append(row)
        return tuple(records)

    def to_dataframe(self) -> Any:
        """Convert to a pandas DataFrame indexed by ``geography_id``.

        Returns:
            A ``pandas.DataFrame`` with one row per geographic unit and
            one column per factor, indexed by ``geography_id``.

        Raises:
            ImportError: If pandas is not installed. Install the optional
                dataframes extra with ``pip install nyc311[dataframes]``.
        """
        try:
            import pandas as pd
        except ImportError as exc:
            message = (
                "pandas is required for to_dataframe(). "
                "Install it with: pip install nyc311[dataframes]"
            )
            raise ImportError(message) from exc

        data: dict[str, Any] = {"geography_id": self.geography_ids, **self.columns}
        return pd.DataFrame(data).set_index("geography_id")

columns instance-attribute

columns: dict[str, tuple[Any, ...]]

geography_ids instance-attribute

geography_ids: tuple[str, ...]

to_records

to_records() -> tuple[dict[str, Any], ...]

Convert to a tuple of row dictionaries.

Returns:

Type Description
tuple[dict[str, Any], ...]

A tuple where each element is a dict containing geography_id plus one key per factor in the pipeline. The row order matches :attr:geography_ids.

Source code in src/nyc311/factors/_base.py
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
def to_records(self) -> tuple[dict[str, Any], ...]:
    """Pivot the columnar data into one dictionary per row.

    Returns:
        A tuple of dicts, one per entry of :attr:`geography_ids` and in
        the same order. Each dict holds ``geography_id`` followed by one
        key per column.
    """
    return tuple(
        {
            "geography_id": geography_id,
            **{name: values[index] for name, values in self.columns.items()},
        }
        for index, geography_id in enumerate(self.geography_ids)
    )

to_dataframe

to_dataframe() -> Any

Convert to a pandas DataFrame indexed by geography_id.

Returns:

Type Description
Any

A pandas.DataFrame with one row per geographic unit and one column per factor, indexed by geography_id.

Raises:

Type Description
ImportError

If pandas is not installed. Install the optional dataframes extra with pip install nyc311[dataframes].

Source code in src/nyc311/factors/_base.py
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
def to_dataframe(self) -> Any:
    """Build a pandas DataFrame indexed by ``geography_id``.

    Returns:
        A ``pandas.DataFrame`` with one row per entry of
        :attr:`geography_ids` and one column per key of
        :attr:`columns`; ``geography_id`` is the index.

    Raises:
        ImportError: If pandas cannot be imported. The message points
            at ``pip install nyc311[dataframes]``.
    """
    # pandas is optional, so it is imported only when requested.
    try:
        import pandas as pd
    except ImportError as exc:
        message = (
            "pandas is required for to_dataframe(). "
            "Install it with: pip install nyc311[dataframes]"
        )
        raise ImportError(message) from exc

    frame = pd.DataFrame({"geography_id": self.geography_ids, **self.columns})
    return frame.set_index("geography_id")

AnomalyScoreFactor

Bases: Factor

Z-score of this unit's complaint volume.

A z-score is relative to the full set of contexts in a pipeline run, but this factor is stateless: rather than accumulating counts during :meth:Pipeline.run, it uses a fixed population_mean and population_std provided at construction time.

Returns 0.0 when population_std is zero.

Source code in src/nyc311/factors/_builtin.py
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
class AnomalyScoreFactor(Factor):
    """Z-score of a context's complaint count against fixed statistics.

    The factor is stateless: it does not accumulate counts across the
    contexts of a pipeline run. The caller supplies *population_mean*
    and *population_std* at construction time, and every context is
    scored against those fixed values.

    Returns ``0.0`` when *population_std* is zero.
    """

    name = "anomaly_score"
    dtype = "float"

    def __init__(
        self,
        *,
        population_mean: float,
        population_std: float,
    ) -> None:
        """Store the reference statistics used by :meth:`compute`.

        Args:
            population_mean: Reference complaint count subtracted from
                each context's count.
            population_std: Reference standard deviation used as the
                divisor. When it equals ``0``, :meth:`compute` returns
                ``0.0`` instead of dividing.
        """
        self._mean, self._std = population_mean, population_std

    def compute(self, context: FactorContext) -> float:
        """Score ``context`` against the stored mean and deviation.

        Returns:
            ``(len(context.complaints) - population_mean) /
            population_std``, or ``0.0`` when ``population_std`` is zero.
        """
        spread = self._std
        if spread == 0:
            # Guard against division by zero.
            return 0.0
        observed = len(context.complaints)
        return (observed - self._mean) / spread

name class-attribute instance-attribute

name = 'anomaly_score'

dtype class-attribute instance-attribute

dtype = 'float'

compute

compute(context: FactorContext) -> float

Return the z-score of this context's complaint volume.

Returns:

Type Description
float

(count - population_mean) / population_std, or 0.0 when population_std is zero.

Source code in src/nyc311/factors/_builtin.py
206
207
208
209
210
211
212
213
214
215
def compute(self, context: FactorContext) -> float:
    """Score ``context`` against the stored mean and deviation.

    Returns:
        ``(len(context.complaints) - population_mean) /
        population_std``, or ``0.0`` when ``population_std`` is zero.
    """
    spread = self._std
    if spread == 0:
        # Guard against division by zero.
        return 0.0
    observed = len(context.complaints)
    return (observed - self._mean) / spread

ComplaintVolumeFactor

Bases: Factor

Total complaint count, optionally per-capita per 10 000 residents.

When per_capita is True and :attr:FactorContext.total_population is available, the result is count / population * 10_000 (a float). Otherwise the raw integer count is returned.

Source code in src/nyc311/factors/_builtin.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
class ComplaintVolumeFactor(Factor):
    """Complaint count, optionally expressed per 10 000 residents.

    With *per_capita* enabled and a positive
    :attr:`FactorContext.total_population`, the value is
    ``count / population * 10_000`` (a float). In every other case the
    raw integer count is returned.
    """

    dtype = "int"

    def __init__(self, *, per_capita: bool = False) -> None:
        """Configure the factor name and dtype for the chosen mode.

        Args:
            per_capita: When ``True`` the factor is named
                ``complaint_rate_per_10k`` and its ``dtype`` becomes
                ``"float"``. When ``False`` it is named
                ``complaint_volume`` and keeps the class-level
                ``"int"`` dtype.
        """
        self._per_capita = per_capita
        if per_capita:
            self.name = "complaint_rate_per_10k"
            self.dtype = "float"  # type: ignore[assignment]
        else:
            self.name = "complaint_volume"

    def compute(self, context: FactorContext) -> int | float:
        """Return the count, or the per-10k rate, for ``context``.

        Returns:
            ``len(context.complaints)`` as an int, or, when per-capita
            mode is on and ``context.total_population`` is truthy and
            positive, the float ``count / population * 10_000``.
        """
        count = len(context.complaints)
        if not self._per_capita:
            return count
        population = context.total_population
        # A missing or non-positive population falls back to the raw count.
        if population and population > 0:
            return count / population * 10_000
        return count

dtype class-attribute instance-attribute

dtype = 'int'

name instance-attribute

name = (
    "complaint_rate_per_10k"
    if per_capita
    else "complaint_volume"
)

compute

compute(context: FactorContext) -> int | float

Return the complaint volume (or per-capita rate) for context.

Returns:

Type Description
int | float

The integer count of complaints in the context, or, when per_capita is enabled and population is available, the float count / population * 10_000.

Source code in src/nyc311/factors/_builtin.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def compute(self, context: FactorContext) -> int | float:
    """Return the count, or the per-10k rate, for ``context``.

    Returns:
        ``len(context.complaints)`` as an int, or, when per-capita mode
        is on and ``context.total_population`` is truthy and positive,
        the float ``count / population * 10_000``.
    """
    count = len(context.complaints)
    if not self._per_capita:
        return count
    population = context.total_population
    # A missing or non-positive population falls back to the raw count.
    if population and population > 0:
        return count / population * 10_000
    return count

RecurrenceFactor

Bases: Factor

Fraction of distinct geocoded locations that appear more than once.

Locations are identified by rounding latitude/longitude to 4 decimal places (~11 m precision). Returns 0.0 when no complaints have coordinates.

Source code in src/nyc311/factors/_builtin.py
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
class RecurrenceFactor(Factor):
    """Fraction of distinct geocoded locations that appear more than once.

    A location is the ``(latitude, longitude)`` pair rounded to 4
    decimal places. The denominator is the number of distinct rounded
    locations, not the number of complaints. Returns ``0.0`` when no
    complaints have coordinates.
    """

    name = "recurrence_rate"
    dtype = "float"

    def compute(self, context: FactorContext) -> float:
        """Return the recurrent-location share for ``context``.

        Complaints whose latitude or longitude is ``None`` are ignored.

        Returns:
            The number of distinct rounded locations seen more than
            once, divided by the number of distinct rounded locations.
            Returns ``0.0`` when no complaints have coordinates.
        """
        location_counts = Counter(
            (round(c.latitude, 4), round(c.longitude, 4))  # type: ignore[arg-type]
            for c in context.complaints
            if c.latitude is not None and c.longitude is not None
        )
        if not location_counts:
            return 0.0

        recurrent = sum(1 for hits in location_counts.values() if hits > 1)
        return recurrent / len(location_counts)

name class-attribute instance-attribute

name = 'recurrence_rate'

dtype class-attribute instance-attribute

dtype = 'float'

compute

compute(context: FactorContext) -> float

Return the recurrent-location share for context.

Returns:

Type Description
float

The fraction of geocoded complaint locations (latitude and longitude rounded to 4 decimal places) that appear more than once in the context. Returns 0.0 when no complaints have coordinates.

Source code in src/nyc311/factors/_builtin.py
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
def compute(self, context: FactorContext) -> float:
    """Return the recurrent-location share for ``context``.

    A location is the ``(latitude, longitude)`` pair rounded to 4
    decimal places. Complaints whose latitude or longitude is ``None``
    are ignored.

    Returns:
        The number of distinct rounded locations seen more than once,
        divided by the number of distinct rounded locations. Returns
        ``0.0`` when no complaints have coordinates.
    """
    location_counts = Counter(
        (round(c.latitude, 4), round(c.longitude, 4))  # type: ignore[arg-type]
        for c in context.complaints
        if c.latitude is not None and c.longitude is not None
    )
    if not location_counts:
        return 0.0

    recurrent = sum(1 for hits in location_counts.values() if hits > 1)
    return recurrent / len(location_counts)

ResolutionTimeFactor

Bases: Factor

Median or mean days between complaint creation and the end of the context's time window, computed over resolved complaints.

Uses resolution_description is not None as a proxy for resolved. Returns -1.0 when no resolved complaints exist in the context.

Source code in src/nyc311/factors/_builtin.py
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
class ResolutionTimeFactor(Factor):
    """Median or mean days from complaint creation to the window's end.

    Only complaints with ``resolution_description is not None`` (used as
    a proxy for "resolved") are considered. The elapsed time is measured
    to ``context.time_window_end``, not to a per-complaint resolution
    date. Returns ``-1.0`` when no resolved complaints exist in the
    context.
    """

    name = "resolution_time_days"
    dtype = "float"

    def __init__(self, *, method: str = "median") -> None:
        """Initialize the factor.

        Args:
            method: Aggregation strategy across resolved complaints; one
                of ``"median"`` (default) or ``"mean"``.

        Raises:
            ValueError: If ``method`` is not ``"median"`` or ``"mean"``.
        """
        if method not in ("median", "mean"):
            msg = f"method must be 'median' or 'mean', got {method!r}"
            raise ValueError(msg)
        self._method = method

    def compute(self, context: FactorContext) -> float:
        """Return the median (or mean) elapsed days for ``context``.

        Returns:
            The whole-day difference between ``context.time_window_end``
            and each resolved complaint's ``created_date``, clamped at
            ``0.0`` and aggregated by the configured ``method``. Returns
            ``-1.0`` when no complaint has a resolution description.
        """
        days = [
            # Clamp so complaints created after the window end count as 0.
            max(float((context.time_window_end - c.created_date).days), 0.0)
            for c in context.complaints
            if c.resolution_description is not None
        ]
        if not days:
            return -1.0
        return median(days) if self._method == "median" else mean(days)

name class-attribute instance-attribute

name = 'resolution_time_days'

dtype class-attribute instance-attribute

dtype = 'float'

compute

compute(context: FactorContext) -> float

Return the median (or mean) resolution time for context.

Returns:

Type Description
float

The number of days between complaint creation and the window's end across resolved complaints, aggregated by the configured method. Returns -1.0 when no complaints in the context have a resolution description.

Source code in src/nyc311/factors/_builtin.py
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def compute(self, context: FactorContext) -> float:
    """Return the median (or mean) elapsed days for ``context``.

    Only complaints with a non-``None`` ``resolution_description`` are
    included.

    Returns:
        The whole-day difference between ``context.time_window_end`` and
        each included complaint's ``created_date``, clamped at ``0.0``
        and aggregated by the configured ``method``. Returns ``-1.0``
        when no complaint has a resolution description.
    """
    days = [
        # Clamp so complaints created after the window end count as 0.
        max(float((context.time_window_end - c.created_date).days), 0.0)
        for c in context.complaints
        if c.resolution_description is not None
    ]
    if not days:
        return -1.0
    return median(days) if self._method == "median" else mean(days)

ResponseRateFactor

Bases: Factor

Fraction of complaints that received a resolution description.

Range [0.0, 1.0]. Returns 0.0 for empty contexts.

Source code in src/nyc311/factors/_builtin.py
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
class ResponseRateFactor(Factor):
    """Share of complaints that carry a resolution description.

    The value lies in [0.0, 1.0].  Empty contexts yield ``0.0``.
    """

    name = "response_rate"
    dtype = "float"

    def compute(self, context: FactorContext) -> float:
        """Return the share of complaints in ``context`` with a resolution.

        Returns:
            The number of complaints whose ``resolution_description`` is
            not ``None`` divided by the total number of complaints, or
            ``0.0`` when the context has no complaints.
        """
        complaints = context.complaints
        if not complaints:
            return 0.0
        answered = [c for c in complaints if c.resolution_description is not None]
        return len(answered) / len(complaints)

name class-attribute instance-attribute

name = 'response_rate'

dtype class-attribute instance-attribute

dtype = 'float'

compute

compute(context: FactorContext) -> float

Return the resolved fraction of complaints in context.

Returns:

Type Description
float

The fraction of complaints with a non-null resolution_description, in [0.0, 1.0]. Returns 0.0 for empty contexts.

Source code in src/nyc311/factors/_builtin.py
227
228
229
230
231
232
233
234
235
236
237
238
239
240
def compute(self, context: FactorContext) -> float:
    """Return the share of complaints in ``context`` with a resolution.

    Returns:
        The number of complaints whose ``resolution_description`` is not
        ``None`` divided by the total number of complaints, or ``0.0``
        when the context has no complaints.
    """
    complaints = context.complaints
    if not complaints:
        return 0.0
    answered = [c for c in complaints if c.resolution_description is not None]
    return len(answered) / len(complaints)

SeasonalityFactor

Bases: Factor

Deviation of complaint count from a seasonal baseline.

baseline_monthly_counts maps month number (1-12) to the expected count for that month. The factor returns (actual - expected) / expected as a fractional deviation. Returns 0.0 when the baseline is missing for the context's month or is zero.

Source code in src/nyc311/factors/_builtin.py
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
class SeasonalityFactor(Factor):
    """Relative deviation of complaint count from a monthly baseline.

    *baseline_monthly_counts* maps a month number to the expected count
    for that month.  The factor yields ``(actual - expected) /
    expected``.  It yields ``0.0`` when the context's month has no
    baseline entry or the entry is zero or negative.
    """

    name = "seasonality_deviation"
    dtype = "float"

    def __init__(self, baseline_monthly_counts: dict[int, float]) -> None:
        """Store the baseline mapping used by :meth:`compute`.

        Args:
            baseline_monthly_counts: Mapping from month number to the
                expected complaint count for that month. A month absent
                from the mapping is treated as having no baseline.
        """
        self._baseline = baseline_monthly_counts

    def compute(self, context: FactorContext) -> float:
        """Return the fractional deviation from the month's baseline.

        The month is taken from ``context.time_window_start``.

        Returns:
            ``(actual - expected) / expected`` where ``actual`` is
            ``len(context.complaints)`` and ``expected`` is the baseline
            for that month. Returns ``0.0`` when the baseline is missing
            or non-positive.
        """
        baseline = self._baseline.get(context.time_window_start.month, 0.0)
        if baseline <= 0:
            # No usable baseline, so the ratio is undefined.
            return 0.0
        return (len(context.complaints) - baseline) / baseline

name class-attribute instance-attribute

name = 'seasonality_deviation'

dtype class-attribute instance-attribute

dtype = 'float'

compute

compute(context: FactorContext) -> float

Return the fractional deviation from the seasonal baseline.

Returns:

Type Description
float

(actual - expected) / expected where actual is the number of complaints in the context and expected is the baseline for the context's start-month. Returns 0.0 when the baseline is missing or non-positive for that month.

Source code in src/nyc311/factors/_builtin.py
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
def compute(self, context: FactorContext) -> float:
    """Return the fractional deviation from the month's baseline.

    The month is taken from ``context.time_window_start``.

    Returns:
        ``(actual - expected) / expected`` where ``actual`` is
        ``len(context.complaints)`` and ``expected`` is the baseline for
        that month. Returns ``0.0`` when the baseline is missing or
        non-positive.
    """
    baseline = self._baseline.get(context.time_window_start.month, 0.0)
    if baseline <= 0:
        # No usable baseline, so the ratio is undefined.
        return 0.0
    return (len(context.complaints) - baseline) / baseline

TopicConcentrationFactor

Bases: Factor

Herfindahl-Hirschman Index of complaint-type shares.

HHI = sum(share_i^2) where share_i is the proportion of complaints of type i. Range [1/N, 1.0]; higher values indicate more concentration in fewer complaint types.

Returns 0.0 when the context has no complaints.

Source code in src/nyc311/factors/_builtin.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
class TopicConcentrationFactor(Factor):
    """Herfindahl-Hirschman Index (HHI) over complaint-type shares.

    HHI = sum(share_i^2), where share_i is the proportion of complaints
    whose type is *i*.  With N distinct types the value lies in
    [1/N, 1.0]; larger values mean complaints cluster in fewer types.

    Yields ``0.0`` for a context without complaints.
    """

    name = "topic_concentration"
    dtype = "float"

    def compute(self, context: FactorContext) -> float:
        """Return the HHI of complaint-type shares for ``context``.

        Returns:
            ``sum(share_i ** 2)`` over the distinct ``complaint_type``
            values, where ``share_i`` is that type's count divided by
            the total number of complaints. Returns ``0.0`` when the
            context has no complaints.
        """
        complaints = context.complaints
        if not complaints:
            return 0.0
        total = len(complaints)
        type_counts = Counter(complaint.complaint_type for complaint in complaints)
        shares = [occurrences / total for occurrences in type_counts.values()]
        return sum(share ** 2 for share in shares)

name class-attribute instance-attribute

name = 'topic_concentration'

dtype class-attribute instance-attribute

dtype = 'float'

compute

compute(context: FactorContext) -> float

Return the HHI of complaint-type shares for context.

Returns:

Type Description
float

sum(share_i ** 2) where each share_i is the proportion of complaints of type i. The value lies in [1/N, 1.0] and increases as complaints concentrate in fewer types. Returns 0.0 when the context has no complaints.

Source code in src/nyc311/factors/_builtin.py
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def compute(self, context: FactorContext) -> float:
    """Return the HHI of complaint-type shares for ``context``.

    Returns:
        ``sum(share_i ** 2)`` over the distinct ``complaint_type``
        values, where ``share_i`` is that type's count divided by the
        total number of complaints. Returns ``0.0`` when the context has
        no complaints.
    """
    complaints = context.complaints
    if not complaints:
        return 0.0
    total = len(complaints)
    type_counts = Counter(complaint.complaint_type for complaint in complaints)
    shares = [occurrences / total for occurrences in type_counts.values()]
    return sum(share ** 2 for share in shares)

dispatch_factor_factory_engine

dispatch_factor_factory_engine(
    panel: Panel,
    *,
    family: str = "did",
    method: str = "twfe",
    outcome: str | None = None,
    **engine_kwargs: Any,
) -> Any

Call factor_factory.engines.<family>.estimate on panel.

This is the chaining target behind :meth:nyc311.factors.Pipeline.as_factor_factory_estimate. It lazily imports the requested engine family so callers don't pay the import cost for families they don't use, and it raises a friendly :class:ImportError when the family's optional dependencies are missing.

Parameters:

Name Type Description Default
panel Panel

A :class:factor_factory.tidy.Panel. Typically produced by :meth:nyc311.temporal.PanelDataset.to_factor_factory_panel.

required
family str

Engine-family module name under factor_factory.engines. One of :data:_SUPPORTED_FAMILIES.

'did'
method str

Registry key for a specific adapter inside the family. For example, "twfe" / "cs" / "sa" / "bjs" for family="did".

'twfe'
outcome str | None

Outcome column on the Panel. When None, the engine falls back to panel.outcome_col (the primary outcome declared in :class:PanelMetadata).

None
**engine_kwargs Any

Additional keyword arguments forwarded to the engine's estimate dispatcher.

{}

Returns:

Type Description
Any

The factor-factory <Family>Results object the engine returned. Its :meth:summary_table method produces a pandas.DataFrame summary.

Raises:

Type Description
ValueError

If family is not in :data:_SUPPORTED_FAMILIES.

ImportError

If factor-factory is not installed or the requested engine family's optional dependencies are missing.

Source code in src/nyc311/factors/_factor_factory.py
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
def dispatch_factor_factory_engine(
    panel: ff_tidy.Panel,
    *,
    family: str = "did",
    method: str = "twfe",
    outcome: str | None = None,
    **engine_kwargs: Any,
) -> Any:
    """Run ``factor_factory.engines.<family>.estimate`` on ``panel``.

    The engine-family module is imported only when this function is
    called, after ``family`` has been checked against
    ``_SUPPORTED_FAMILIES``. An import failure is re-raised as an
    :class:`ImportError` carrying an installation hint, chained to the
    original exception.

    Args:
        panel: The panel passed as the first positional argument to the
            engine's ``estimate`` function.
        family: Engine-family module name under
            ``factor_factory.engines``. Must be a member of
            ``_SUPPORTED_FAMILIES``.
        method: Adapter key inside the family; forwarded to the engine
            as the one-element tuple ``methods=(method,)``.
        outcome: Forwarded unchanged as ``outcome``; may be ``None``.
        **engine_kwargs: Extra keyword arguments forwarded unchanged to
            ``estimate``.

    Returns:
        Whatever the engine module's ``estimate`` function returns.

    Raises:
        ValueError: If ``family`` is not in ``_SUPPORTED_FAMILIES``.
        ImportError: If the engine-family module cannot be imported.
    """
    if family not in _SUPPORTED_FAMILIES:
        raise ValueError(
            f"Unknown factor-factory engine family {family!r}. "
            f"Supported: {_SUPPORTED_FAMILIES}"
        )

    module_name = f"factor_factory.engines.{family}"
    try:
        engine_module = importlib.import_module(module_name)
    except ImportError as exc:
        raise ImportError(
            f"Could not import {module_name}. Install factor-factory "
            f"with: pip install nyc311 (or pip install factor-factory)."
        ) from exc

    return engine_module.estimate(
        panel,
        methods=(method,),
        outcome=outcome,
        **engine_kwargs,
    )

Temporal

nyc311.temporal

Temporal panel module for longitudinal 311 complaint analysis.

PanelDataset dataclass

Balanced panel of (geographic_unit x time_period) observations.

Methods return new :class:PanelDataset instances—the dataset is never mutated in place.

Source code in src/nyc311/temporal/_models.py
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
@dataclass(frozen=True, slots=True)
class PanelDataset:
    """Balanced panel of (geographic_unit x time_period) observations.

    Methods return **new** :class:`PanelDataset` instances—the dataset is
    never mutated in place.
    """

    observations: tuple[PanelObservation, ...]
    unit_type: str
    periods: tuple[str, ...]
    treatment_events: tuple[TreatmentEvent, ...] = ()

    # ------------------------------------------------------------------
    # Filtering helpers
    # ------------------------------------------------------------------

    def treatment_group(self) -> PanelDataset:
        """Return only observations in units that were ever treated.

        Returns:
            A new :class:`PanelDataset` whose ``observations`` are
            restricted to units with a non-null ``treatment_date``. The
            ``periods`` and ``treatment_events`` fields are preserved.
        """
        treated_ids = {
            obs.unit_id for obs in self.observations if obs.treatment_date is not None
        }
        return PanelDataset(
            observations=tuple(
                o for o in self.observations if o.unit_id in treated_ids
            ),
            unit_type=self.unit_type,
            periods=self.periods,
            treatment_events=self.treatment_events,
        )

    def control_group(self) -> PanelDataset:
        """Return only observations in units that were never treated.

        Returns:
            A new :class:`PanelDataset` whose ``observations`` are
            restricted to units with no ``treatment_date``. The
            ``periods`` and ``treatment_events`` fields are preserved.
        """
        treated_ids = {
            obs.unit_id for obs in self.observations if obs.treatment_date is not None
        }
        return PanelDataset(
            observations=tuple(
                o for o in self.observations if o.unit_id not in treated_ids
            ),
            unit_type=self.unit_type,
            periods=self.periods,
            treatment_events=self.treatment_events,
        )

    def filter_periods(self, start: str, end: str) -> PanelDataset:
        """Restrict the dataset to a closed interval of periods.

        Args:
            start: Inclusive lower-bound period label.
            end: Inclusive upper-bound period label.

        Returns:
            A new :class:`PanelDataset` whose ``observations`` and
            ``periods`` are limited to labels ``p`` satisfying
            ``start <= p <= end``.
        """
        filtered_periods = tuple(p for p in self.periods if start <= p <= end)
        return PanelDataset(
            observations=tuple(
                o for o in self.observations if start <= o.period <= end
            ),
            unit_type=self.unit_type,
            periods=filtered_periods,
            treatment_events=self.treatment_events,
        )

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def unit_ids(self) -> tuple[str, ...]:
        """The sorted, unique unit identifiers in the dataset.

        Returns:
            A tuple of distinct ``unit_id`` values from
            ``observations``, in lexicographic order.
        """
        return tuple(sorted({obs.unit_id for obs in self.observations}))

    # ------------------------------------------------------------------
    # Export
    # ------------------------------------------------------------------

    def to_factor_factory_panel(
        self,
        *,
        outcome_col: str = "complaint_count",
        provenance: Any | None = None,
        spatial_weights: dict[str, dict[str, float]] | None = None,
    ) -> Any:
        """Convert to a :class:`factor_factory.tidy.Panel`.

        The adapter is additive — ``self`` is unchanged. Treatment events
        are translated to factor-factory's frozen
        :class:`TreatmentEvent` model, and an optional
        ``spatial_weights`` dict (as produced by
        :func:`nyc311.temporal.build_distance_weights`) is stashed on
        ``panel.df.attrs["nyc311_spatial_weights"]`` for in-memory
        round-trip.

        See :mod:`nyc311.temporal._factor_factory` for details on the
        column crosswalk.

        Args:
            outcome_col: Column name to tag as the primary outcome in
                the Panel metadata. Defaults to ``"complaint_count"``.
            provenance: Optional ``factor_factory.tidy.Provenance``
                record. When ``None``, a default pointing at the NYC
                Open Data Socrata endpoint is constructed.
            spatial_weights: Optional nested weights dict from
                :func:`build_distance_weights`.

        Returns:
            A fully-validated ``factor_factory.tidy.Panel``.

        Raises:
            ImportError: If factor-factory or pandas is not installed.
            ValueError: If the dataset is empty or ``outcome_col`` is
                absent from the resulting DataFrame.
        """
        from nyc311.temporal._factor_factory import (
            panel_dataset_to_factor_factory,
        )

        return panel_dataset_to_factor_factory(
            self,
            outcome_col=outcome_col,
            provenance=provenance,
            spatial_weights=spatial_weights,
        )

    def to_dataframe(self) -> Any:
        """Convert to a pandas DataFrame with a ``(unit_id, period)`` MultiIndex.

        Each per-type complaint count is exploded into a
        ``complaints_<type>`` column, and any per-unit covariates are
        merged in as additional columns.

        Returns:
            A ``pandas.DataFrame`` indexed by ``(unit_id, period)`` with
            one column per panel measure. The frame has no rows when the
            dataset is empty.

        Raises:
            ImportError: If pandas is not installed. Install the optional
                dataframes extra with ``pip install nyc311[dataframes]``.
        """
        try:
            import pandas as pd
        except ImportError as exc:
            message = (
                "pandas is required for to_dataframe(). "
                "Install it with: pip install nyc311[dataframes]"
            )
            raise ImportError(message) from exc

        rows: list[dict[str, Any]] = []
        for obs in self.observations:
            row: dict[str, Any] = {
                "unit_id": obs.unit_id,
                "period": obs.period,
                "complaint_count": obs.complaint_count,
                "resolution_rate": obs.resolution_rate,
                "median_resolution_days": obs.median_resolution_days,
                "treatment": obs.treatment,
                "population": obs.population,
            }
            for ctype, cnt in obs.complaint_counts_by_type.items():
                row[f"complaints_{ctype}"] = cnt
            if obs.covariates:
                row.update(obs.covariates)
            rows.append(row)

        df = pd.DataFrame(rows)
        if not df.empty:
            df = df.set_index(["unit_id", "period"])
        return df

observations instance-attribute

observations: tuple[PanelObservation, ...]

unit_type instance-attribute

unit_type: str

periods instance-attribute

periods: tuple[str, ...]

treatment_events class-attribute instance-attribute

treatment_events: tuple[TreatmentEvent, ...] = ()

unit_ids property

unit_ids: tuple[str, ...]

The sorted, unique unit identifiers in the dataset.

Returns:

Type Description
tuple[str, ...]

A tuple of distinct unit_id values from observations, in lexicographic order.

treatment_group

treatment_group() -> PanelDataset

Return only observations in units that were ever treated.

Returns:

Type Description
PanelDataset

A new PanelDataset whose observations are restricted to units with a non-null treatment_date. The periods and treatment_events fields are preserved.

Source code in src/nyc311/temporal/_models.py
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def treatment_group(self) -> PanelDataset:
    """Keep the observations of every unit that was ever treated.

    A unit counts as treated when at least one of its observations
    carries a non-null ``treatment_date``; all observations of such a
    unit are kept, including those without a date.

    Returns:
        A new :class:`PanelDataset` limited to treated units, with
        ``unit_type``, ``periods`` and ``treatment_events`` copied over
        unchanged.
    """
    ever_treated = set()
    for row in self.observations:
        if row.treatment_date is not None:
            ever_treated.add(row.unit_id)

    kept = tuple(row for row in self.observations if row.unit_id in ever_treated)
    return PanelDataset(
        observations=kept,
        unit_type=self.unit_type,
        periods=self.periods,
        treatment_events=self.treatment_events,
    )

control_group

control_group() -> PanelDataset

Return only observations in units that were never treated.

Returns:

Type Description
PanelDataset

A new PanelDataset whose observations are restricted to units with no treatment_date. The periods and treatment_events fields are preserved.

Source code in src/nyc311/temporal/_models.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
def control_group(self) -> PanelDataset:
    """Keep the observations of every unit that was never treated.

    A unit is excluded as soon as any one of its observations carries a
    non-null ``treatment_date``.

    Returns:
        A new :class:`PanelDataset` limited to never-treated units, with
        ``unit_type``, ``periods`` and ``treatment_events`` copied over
        unchanged.
    """
    ever_treated = set()
    for row in self.observations:
        if row.treatment_date is not None:
            ever_treated.add(row.unit_id)

    kept = tuple(row for row in self.observations if row.unit_id not in ever_treated)
    return PanelDataset(
        observations=kept,
        unit_type=self.unit_type,
        periods=self.periods,
        treatment_events=self.treatment_events,
    )

filter_periods

filter_periods(start: str, end: str) -> PanelDataset

Restrict the dataset to a closed interval of periods.

Parameters:

Name Type Description Default
start str

Inclusive lower-bound period label.

required
end str

Inclusive upper-bound period label.

required

Returns:

Type Description
PanelDataset

A new PanelDataset whose observations and periods are limited to labels p satisfying start <= p <= end.

Source code in src/nyc311/temporal/_models.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
def filter_periods(self, start: str, end: str) -> PanelDataset:
    """Restrict the dataset to the closed period interval ``[start, end]``.

    Period labels are compared as plain strings, so the bounds are
    applied in lexicographic order.

    Args:
        start: Inclusive lower-bound period label.
        end: Inclusive upper-bound period label.

    Returns:
        A new :class:`PanelDataset` whose ``periods`` and
        ``observations`` only contain labels ``p`` with
        ``start <= p <= end``. ``treatment_events`` is kept as is.
    """

    def _within(label: str) -> bool:
        return start <= label and label <= end

    kept_periods = tuple(filter(_within, self.periods))
    kept_rows = tuple(row for row in self.observations if _within(row.period))
    return PanelDataset(
        observations=kept_rows,
        unit_type=self.unit_type,
        periods=kept_periods,
        treatment_events=self.treatment_events,
    )

to_factor_factory_panel

to_factor_factory_panel(
    *,
    outcome_col: str = "complaint_count",
    provenance: Any | None = None,
    spatial_weights: dict[str, dict[str, float]]
    | None = None,
) -> Any

Convert to a :class:factor_factory.tidy.Panel.

The adapter is additive — self is unchanged. Treatment events are translated to factor-factory's frozen :class:TreatmentEvent model, and an optional spatial_weights dict (as produced by :func:nyc311.temporal.build_distance_weights) is stashed on panel.df.attrs["nyc311_spatial_weights"] for in-memory round-trip.

See :mod:nyc311.temporal._factor_factory for details on the column crosswalk.

Parameters:

Name Type Description Default
outcome_col str

Column name to tag as the primary outcome in the Panel metadata. Defaults to "complaint_count".

'complaint_count'
provenance Any | None

Optional factor_factory.tidy.Provenance record. When None, a default pointing at the NYC Open Data Socrata endpoint is constructed.

None
spatial_weights dict[str, dict[str, float]] | None

Optional nested weights dict from :func:build_distance_weights.

None

Returns:

Type Description
Any

A fully-validated factor_factory.tidy.Panel.

Raises:

Type Description
ImportError

If factor-factory or pandas is not installed.

ValueError

If the dataset is empty or outcome_col is absent from the resulting DataFrame.

Source code in src/nyc311/temporal/_models.py
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
def to_factor_factory_panel(
    self,
    *,
    outcome_col: str = "complaint_count",
    provenance: Any | None = None,
    spatial_weights: dict[str, dict[str, float]] | None = None,
) -> Any:
    """Convert to a :class:`factor_factory.tidy.Panel`.

    Thin wrapper that forwards ``self`` and all keyword arguments to
    :func:`nyc311.temporal._factor_factory.panel_dataset_to_factor_factory`.
    The adapter module is imported lazily inside the method.

    Args:
        outcome_col: Column name to tag as the primary outcome in the
            Panel metadata. Defaults to ``"complaint_count"``.
        provenance: Optional ``factor_factory.tidy.Provenance`` record;
            ``None`` lets the adapter build its default.
        spatial_weights: Optional nested weights dict (as produced by
            :func:`build_distance_weights`) for the adapter to attach to
            the resulting panel.

    Returns:
        Whatever the adapter returns: a ``factor_factory.tidy.Panel``.

    Raises:
        ImportError: If factor-factory or pandas is not installed.
        ValueError: If the dataset is empty or ``outcome_col`` is
            absent from the resulting DataFrame.
    """
    from nyc311.temporal._factor_factory import (
        panel_dataset_to_factor_factory as convert,
    )

    return convert(
        self,
        outcome_col=outcome_col,
        provenance=provenance,
        spatial_weights=spatial_weights,
    )

to_dataframe

to_dataframe() -> Any

Convert to a pandas DataFrame with a (unit_id, period) MultiIndex.

Each per-type complaint count is exploded into a complaints_<type> column, and any per-unit covariates are merged in as additional columns.

Returns:

Type Description
Any

A pandas.DataFrame indexed by (unit_id, period) with one column per panel measure. The frame has no rows when the dataset is empty.

Raises:

Type Description
ImportError

If pandas is not installed. Install the optional dataframes extra with pip install nyc311[dataframes].

Source code in src/nyc311/temporal/_models.py
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
def to_dataframe(self) -> Any:
    """Convert to a pandas DataFrame with a ``(unit_id, period)`` MultiIndex.

    Every observation becomes one row. Per-type complaint counts are
    spread into ``complaints_<type>`` columns and any covariates are
    merged in as further columns.

    Returns:
        A ``pandas.DataFrame`` indexed by ``(unit_id, period)``. When the
        dataset has no observations the frame is returned empty and the
        index is not set.

    Raises:
        ImportError: If pandas is not installed. Install the optional
            dataframes extra with ``pip install nyc311[dataframes]``.
    """
    try:
        import pandas as pd
    except ImportError as exc:
        message = (
            "pandas is required for to_dataframe(). "
            "Install it with: pip install nyc311[dataframes]"
        )
        raise ImportError(message) from exc

    def _flatten(obs: Any) -> dict[str, Any]:
        # Key insertion order fixes the resulting column order.
        record: dict[str, Any] = {
            "unit_id": obs.unit_id,
            "period": obs.period,
            "complaint_count": obs.complaint_count,
            "resolution_rate": obs.resolution_rate,
            "median_resolution_days": obs.median_resolution_days,
            "treatment": obs.treatment,
            "population": obs.population,
        }
        record.update(
            {
                f"complaints_{ctype}": cnt
                for ctype, cnt in obs.complaint_counts_by_type.items()
            }
        )
        if obs.covariates:
            record.update(obs.covariates)
        return record

    frame = pd.DataFrame([_flatten(obs) for obs in self.observations])
    if frame.empty:
        return frame
    return frame.set_index(["unit_id", "period"])

PanelObservation dataclass

One row in a balanced panel: (geographic_unit x time_period).

Source code in src/nyc311/temporal/_models.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
@dataclass(frozen=True, slots=True)
class PanelObservation:
    """One row in a balanced panel: (geographic_unit x time_period).

    The dataclass is frozen, so fields cannot be reassigned after
    construction. Note that the ``dict``-typed fields are ordinary
    dictionaries: freezing the dataclass does not make their contents
    immutable.
    """

    #: Stable identifier of the geographic unit (community district code,
    #: NTA code, borough name, etc.).
    unit_id: str
    #: Period label (for example ``"2024-03"`` for monthly panels).
    period: str
    #: Total number of complaints in this unit/period cell.
    complaint_count: int
    #: Per-complaint-type counts within this cell, keyed by complaint type.
    complaint_counts_by_type: dict[str, int]
    #: Fraction of complaints with a non-null ``resolution_description``.
    resolution_rate: float
    #: Median days from creation to period-end across resolved complaints,
    #: or ``None`` when no complaint in the cell was resolved.
    median_resolution_days: float | None
    #: ``True`` once the unit has been exposed to a treatment event.
    treatment: bool
    #: Date the unit was first treated, or ``None`` if never treated.
    treatment_date: date | None
    #: Total population for the unit, when supplied; ``None`` otherwise.
    population: int | None
    #: Optional time-invariant covariates merged in at panel-build time,
    #: keyed by covariate name. Defaults to ``None`` (no covariates).
    covariates: dict[str, float] | None = None

unit_id instance-attribute

unit_id: str

period instance-attribute

period: str

complaint_count instance-attribute

complaint_count: int

complaint_counts_by_type instance-attribute

complaint_counts_by_type: dict[str, int]

resolution_rate instance-attribute

resolution_rate: float

median_resolution_days instance-attribute

median_resolution_days: float | None

treatment instance-attribute

treatment: bool

treatment_date instance-attribute

treatment_date: date | None

population instance-attribute

population: int | None

covariates class-attribute instance-attribute

covariates: dict[str, float] | None = None

TreatmentEvent dataclass

A policy intervention applied to specific geographic units.

Source code in src/nyc311/temporal/_models.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
@dataclass(frozen=True, slots=True)
class TreatmentEvent:
    """A policy intervention applied to specific geographic units."""

    name: str
    description: str
    treated_units: tuple[str, ...]
    treatment_date: date
    geography: str

    def __post_init__(self) -> None:
        if not self.name.strip():
            raise ValueError("Treatment name must not be empty.")
        if not self.treated_units:
            raise ValueError("treated_units must contain at least one unit.")

name instance-attribute

name: str

description instance-attribute

description: str

treated_units instance-attribute

treated_units: tuple[str, ...]

treatment_date instance-attribute

treatment_date: date

geography instance-attribute

geography: str

panel_dataset_to_factor_factory

panel_dataset_to_factor_factory(
    dataset: PanelDataset,
    *,
    outcome_col: str = "complaint_count",
    provenance: Provenance | None = None,
    spatial_weights: dict[str, dict[str, float]]
    | None = None,
) -> ff_tidy.Panel

Convert a :class:PanelDataset to a :class:factor_factory.tidy.Panel.

Maps nyc311's panel model onto factor-factory's tidy Panel contract:

  • unit_id → Panel first-level MultiIndex, named unit_id.
  • period (string label) → pandas Timestamp at the period start, second-level index named period.
  • complaint_count → primary outcome column (configurable via outcome_col).
  • treatment (bool) → int 0/1 column named treatment.
  • resolution_rate, median_resolution_days, population, per-type complaint counts, and covariates flow through as additional columns the engine can consume as covariates.
  • TreatmentEvent tuples are translated to factor-factory's frozen :class:TreatmentEvent pydantic model (geography maps to dimension).
  • A spatial_weights dict (as produced by :func:nyc311.temporal.build_distance_weights) is attached to the resulting :attr:Panel.df.attrs under the key "nyc311_spatial_weights" for in-memory round-trip.

Parameters:

Name Type Description Default
dataset PanelDataset

The balanced :class:PanelDataset to convert.

required
outcome_col str

Column name to tag as the primary outcome in the Panel metadata. Must be one of "complaint_count", "resolution_rate", "median_resolution_days", or a "complaints_<type>" column present on the observations.

'complaint_count'
provenance Provenance | None

Optional factor-factory :class:Provenance record describing the dataset. When None, a default is constructed pointing at the NYC Open Data 311 Socrata endpoint.

None
spatial_weights dict[str, dict[str, float]] | None

Optional nested dict as produced by :func:build_distance_weights. Stashed on panel.df.attrs["nyc311_spatial_weights"] so downstream code can pick it up without a second computation.

None

Returns:

Type Description
Panel

A fully-validated :class:factor_factory.tidy.Panel.

Raises:

Type Description
ImportError

If factor-factory or pandas is not installed.

ValueError

If dataset is empty or outcome_col is not a column of the assembled DataFrame.

Source code in src/nyc311/temporal/_factor_factory.py
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
def panel_dataset_to_factor_factory(
    dataset: PanelDataset,
    *,
    outcome_col: str = "complaint_count",
    provenance: ff_tidy.Provenance | None = None,
    spatial_weights: dict[str, dict[str, float]] | None = None,
) -> ff_tidy.Panel:
    """Convert a :class:`PanelDataset` to a :class:`factor_factory.tidy.Panel`.

    Crosswalk from nyc311's panel model to the factor-factory Panel:

    - ``unit_id`` and ``period`` become the two levels of the frame's
      MultiIndex (named ``unit_id`` and ``period``); each period label is
      first passed through ``_period_to_timestamp``. The frame is sorted
      by that index.
    - ``complaint_count`` and ``resolution_rate`` are always emitted as
      columns; ``treatment`` is emitted as an ``int`` (0/1).
    - ``median_resolution_days`` and ``population`` are only written for
      observations where they are not ``None``.
    - Per-type complaint counts become ``complaints_<type>`` columns and
      covariates are merged in as additional columns.
    - Each ``TreatmentEvent`` is rebuilt as a factor-factory
      ``TreatmentEvent``, with ``geography`` passed as ``dimension``.
    - ``spatial_weights``, when given, is shallow-copied onto
      ``panel.df.attrs`` under the module's ``_SPATIAL_WEIGHTS_ATTR`` key.

    Args:
        dataset: The :class:`PanelDataset` to convert.
        outcome_col: Column name to tag as the primary outcome in the
            Panel metadata. It must be a column of the assembled frame.
        provenance: Optional factor-factory ``Provenance`` record. When
            ``None``, a default describing the NYC Open Data 311 Socrata
            dataset is constructed.
        spatial_weights: Optional nested weights dict as produced by
            :func:`build_distance_weights`.

    Returns:
        The ``factor_factory.tidy.Panel`` built from the frame and its
        metadata.

    Raises:
        ImportError: If ``factor-factory`` or pandas is not installed.
        ValueError: If ``dataset`` has no observations, or if
            ``outcome_col`` is not among the assembled frame's columns.
    """
    try:
        import pandas as pd
        from factor_factory.tidy import (
            Panel,
            PanelMetadata,
            Provenance,
        )
        from factor_factory.tidy import (
            TreatmentEvent as FFTreatmentEvent,
        )
    except ImportError as exc:
        message = (
            "factor-factory and pandas are required for "
            "PanelDataset.to_factor_factory_panel(). "
            "Install with: pip install nyc311"
        )
        raise ImportError(message) from exc

    if not dataset.observations:
        message = "Cannot convert an empty PanelDataset to a factor-factory Panel."
        raise ValueError(message)

    def _flatten(obs: Any) -> dict[str, Any]:
        # Key insertion order fixes the resulting column order.
        record: dict[str, Any] = {
            "unit_id": obs.unit_id,
            "period": _period_to_timestamp(obs.period),
            "complaint_count": obs.complaint_count,
            "resolution_rate": obs.resolution_rate,
            "treatment": int(obs.treatment),
        }
        # Optional measures are left out of the row entirely when unset.
        if obs.median_resolution_days is not None:
            record["median_resolution_days"] = obs.median_resolution_days
        if obs.population is not None:
            record["population"] = obs.population
        record.update(
            {
                f"complaints_{ctype}": cnt
                for ctype, cnt in obs.complaint_counts_by_type.items()
            }
        )
        if obs.covariates:
            record.update(obs.covariates)
        return record

    frame = pd.DataFrame([_flatten(obs) for obs in dataset.observations])
    frame = frame.set_index(["unit_id", "period"]).sort_index()

    if outcome_col not in frame.columns:
        message = (
            f"outcome_col={outcome_col!r} not in panel columns. "
            f"Available: {sorted(frame.columns)}"
        )
        raise ValueError(message)

    translated_events = []
    for event in dataset.treatment_events:
        translated_events.append(
            FFTreatmentEvent(
                name=event.name,
                description=event.description,
                treated_units=tuple(event.treated_units),
                treatment_date=event.treatment_date,
                dimension=event.geography,
            )
        )
    ff_events = tuple(translated_events)

    if provenance is not None:
        resolved_provenance = provenance
    else:
        resolved_provenance = Provenance(
            data_source="NYC Open Data — 311 Service Requests (Socrata erm2-nwe9)",
            license="CC0-1.0",
            creator="nyc311.temporal.PanelDataset",
            citation="https://opendata.cityofnewyork.us/",
        )

    metadata = PanelMetadata(
        outcome_cols=(outcome_col,),
        period_kind="timestamp",
        freq=_infer_freq(dataset.periods),
        dimension=dataset.unit_type,
        treatment_events=ff_events,
        record_count=len(dataset.observations),
        provenance=resolved_provenance,
    )

    result = Panel(frame, metadata)

    if spatial_weights is not None:
        # Shallow copy: the outer mapping is new, inner dicts are shared.
        result.df.attrs[_SPATIAL_WEIGHTS_ATTR] = dict(spatial_weights)

    return result

spatial_weights_from_panel

spatial_weights_from_panel(
    panel: Panel,
) -> dict[str, dict[str, float]] | None

Recover spatial weights previously attached via the adapter.

Parameters:

Name Type Description Default
panel Panel

A :class:factor_factory.tidy.Panel that was produced by :func:panel_dataset_to_factor_factory with spatial_weights supplied.

required

Returns:

Type Description
dict[str, dict[str, float]] | None

The nested weights dict, or None if no spatial weights were attached.

Source code in src/nyc311/temporal/_factor_factory.py
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
def spatial_weights_from_panel(
    panel: ff_tidy.Panel,
) -> dict[str, dict[str, float]] | None:
    """Read back spatial weights stored on a panel's ``df.attrs``.

    Looks up the ``_SPATIAL_WEIGHTS_ATTR`` key on ``panel.df.attrs``,
    which is where :func:`panel_dataset_to_factor_factory` stores the
    weights when ``spatial_weights`` is supplied.

    Args:
        panel: A :class:`factor_factory.tidy.Panel` to inspect.

    Returns:
        A shallow copy of the stored nested weights dict, or ``None``
        when nothing is stored under that key.
    """
    attached = panel.df.attrs.get(_SPATIAL_WEIGHTS_ATTR)
    return dict(attached) if attached is not None else None

build_complaint_panel

build_complaint_panel(
    records: Sequence[ServiceRequestRecord],
    *,
    geography: str = "community_district",
    freq: str = "ME",
    treatment_events: Sequence[TreatmentEvent] = (),
    population_data: dict[str, int] | None = None,
    covariates: dict[str, dict[str, float]] | None = None,
) -> PanelDataset

Construct a balanced panel from service-request records.

Aggregates records into one observation per (geographic-unit, period) cell, filling missing cells so the resulting :class:PanelDataset is fully balanced across both dimensions.

Parameters:

Name Type Description Default
records Sequence[ServiceRequestRecord]

Raw complaint records to aggregate.

required
geography str

Geographic unit to group by; one of "borough" or "community_district".

'community_district'
freq str

Pandas offset alias controlling the period length ("ME" for monthly, "QE" for quarterly, "YE" for yearly). Both legacy ("M") and modern ("ME") aliases are accepted.

'ME'
treatment_events Sequence[TreatmentEvent]

Policy interventions to code as treatment indicators on each observation.

()
population_data dict[str, int] | None

Mapping {unit_id: total_population} used to populate :attr:PanelObservation.population for per-capita downstream analyses.

None
covariates dict[str, dict[str, float]] | None

Mapping {unit_id: {name: value}} of time-invariant demographic covariates to attach to each observation in a unit.

None

Returns:

Type Description
PanelDataset

A PanelDataset with one observation per (unit, period). When records is empty the returned dataset has no observations and no periods.

Raises:

Type Description
ImportError

If pandas is not installed. Install the optional dataframes extra with pip install nyc311[dataframes].

Source code in src/nyc311/temporal/_panel.py
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
def build_complaint_panel(
    records: Sequence[ServiceRequestRecord],
    *,
    geography: str = "community_district",
    freq: str = "ME",
    treatment_events: Sequence[TreatmentEvent] = (),
    population_data: dict[str, int] | None = None,
    covariates: dict[str, dict[str, float]] | None = None,
) -> PanelDataset:
    """Build a balanced unit-by-period panel from service-request records.

    Every record is assigned to a ``(unit, period)`` cell. The panel then
    contains one observation for each combination of an observed unit and
    an observed period; combinations without any record are emitted as
    zero-count observations so every unit covers the same periods. Only
    periods in which at least one record falls are part of the panel.

    Args:
        records: Raw complaint records to aggregate.
        geography: Geographic unit to group by; one of ``"borough"`` or
            ``"community_district"``. Passed to
            ``record.geography_value(...)``.
        freq: Pandas offset alias controlling the period length
            (``"ME"`` for monthly, ``"QE"`` for quarterly, ``"YE"`` for
            yearly). Both legacy (``"M"``) and modern (``"ME"``) aliases
            are accepted; the value is normalized with
            ``_normalize_freq`` before use.
        treatment_events: Policy interventions to code as treatment
            indicators. When a unit appears in several events the
            earliest ``treatment_date`` is used.
        population_data: Mapping ``{unit_id: total_population}`` copied
            onto each observation of the unit (``None`` when missing).
        covariates: Mapping ``{unit_id: {name: value}}`` of
            time-invariant covariates copied onto each observation of
            the unit (``None`` when missing).

    Returns:
        A :class:`PanelDataset` with one observation per ``(unit,
        period)``, units in sorted order and periods in sorted label
        order. When ``records`` is empty the returned dataset has no
        observations and no periods.

    Raises:
        ImportError: If pandas is not installed. Install the optional
            dataframes extra with ``pip install nyc311[dataframes]``.
    """
    try:
        import pandas as pd
    except ImportError as exc:
        message = (
            "pandas is required for build_complaint_panel(). "
            "Install it with: pip install nyc311[dataframes]"
        )
        raise ImportError(message) from exc

    norm_freq = _normalize_freq(freq)

    # Bucket every record into its (unit, period-label) cell.
    cells: dict[tuple[str, str], list[ServiceRequestRecord]] = defaultdict(list)
    unit_ids: set[str] = set()
    for record in records:
        unit_id = record.geography_value(geography)
        label = str(pd.Timestamp(record.created_date).to_period(norm_freq))
        unit_ids.add(unit_id)
        cells[(unit_id, label)].append(record)

    # No records means no units: return an empty panel early.
    if not unit_ids:
        return PanelDataset(
            observations=(),
            unit_type=geography,
            periods=(),
            treatment_events=tuple(treatment_events),
        )

    # Period axis: the distinct labels seen in the data, sorted as strings.
    period_labels = tuple(sorted({label for _unit_id, label in cells}))

    # Earliest treatment date per treated unit.
    first_treatment: dict[str, date] = {}
    for event in treatment_events:
        for treated_unit in event.treated_units:
            known = first_treatment.get(treated_unit)
            if known is None or event.treatment_date < known:
                first_treatment[treated_unit] = event.treatment_date

    population_by_unit = population_data or {}
    covariates_by_unit = covariates or {}
    panel_rows: list[PanelObservation] = []

    for unit_id in sorted(unit_ids):
        treated_from = first_treatment.get(unit_id)

        for label in period_labels:
            # Missing cells yield an empty list, i.e. a zero-count row.
            cell = cells.get((unit_id, label), [])
            n_complaints = len(cell)
            by_type: Counter[str] = Counter(r.complaint_type for r in cell)

            # A record counts as resolved when it has a resolution description.
            closed = [r for r in cell if r.resolution_description is not None]
            if n_complaints:
                share_resolved = len(closed) / n_complaints
            else:
                share_resolved = 0.0

            # Median of the non-negative day gaps between each resolved
            # record's created date and the end of the period.
            median_days: float | None = None
            if closed:
                period_end = pd.Period(label, freq=norm_freq).end_time.date()
                day_gaps = [
                    max((period_end - r.created_date).days, 0) for r in closed
                ]
                median_days = median(day_gaps)

            # A period is treated when it starts on or after the unit's
            # treatment date. Any failure while deriving the period start
            # or comparing the dates leaves the cell untreated (best effort).
            in_treatment = False
            if treated_from is not None:
                try:
                    period_start = pd.Period(label, freq=norm_freq).start_time.date()
                    in_treatment = period_start >= treated_from
                except Exception:  # noqa: BLE001
                    pass

            panel_rows.append(
                PanelObservation(
                    unit_id=unit_id,
                    period=label,
                    complaint_count=n_complaints,
                    complaint_counts_by_type=dict(by_type),
                    resolution_rate=share_resolved,
                    median_resolution_days=median_days,
                    treatment=in_treatment,
                    treatment_date=treated_from,
                    population=population_by_unit.get(unit_id),
                    covariates=covariates_by_unit.get(unit_id),
                )
            )

    return PanelDataset(
        observations=tuple(panel_rows),
        unit_type=geography,
        periods=period_labels,
        treatment_events=tuple(treatment_events),
    )

build_distance_weights

build_distance_weights(
    unit_centroids: dict[str, tuple[float, float]],
    *,
    threshold_meters: float = 2000.0,
    row_standardize: bool = True,
) -> dict[str, dict[str, float]]

Build an inverse-distance spatial weights matrix.

Units within threshold_meters are neighbors, weighted by 1 / distance. The resulting matrix is symmetric before row-standardization.

Parameters:

Name Type Description Default
unit_centroids dict[str, tuple[float, float]]

Mapping {unit_id: (latitude, longitude)} of unit centroids in WGS84 degrees.

required
threshold_meters float

Maximum great-circle distance, in meters, for two units to be considered neighbors.

2000.0
row_standardize bool

If True, normalize each row of the resulting weights matrix to sum to 1.0.

True

Returns:

Type Description
dict[str, dict[str, float]]

Nested dictionary {unit_a: {unit_b: weight}}. Units with no neighbors map to an empty inner dict.

Source code in src/nyc311/temporal/_spatial_weights.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def build_distance_weights(
    unit_centroids: dict[str, tuple[float, float]],
    *,
    threshold_meters: float = 2000.0,
    row_standardize: bool = True,
) -> dict[str, dict[str, float]]:
    """Compute inverse-distance spatial weights between unit centroids.

    Each unordered pair of units is measured once with
    ``haversine_distance_meters``. Pairs whose distance is strictly
    positive and at most ``threshold_meters`` become mutual neighbors
    with weight ``1 / distance``, so the matrix is symmetric until rows
    are standardized.

    Args:
        unit_centroids: Mapping ``{unit_id: (latitude, longitude)}`` of
            unit centroids in WGS84 degrees.
        threshold_meters: Maximum great-circle distance, in meters, for
            two units to be considered neighbors.
        row_standardize: If ``True``, divide every non-empty row by its
            sum so that it adds up to ``1.0``.

    Returns:
        Nested dictionary ``{unit_a: {unit_b: weight}}`` keyed in sorted
        unit order. Units with no neighbors map to an empty inner dict.
    """
    ordered_ids = sorted(unit_centroids)
    weights: dict[str, dict[str, float]] = {}
    for unit_id in ordered_ids:
        weights[unit_id] = {}

    for position, first_id in enumerate(ordered_ids):
        first_lat, first_lon = unit_centroids[first_id]
        # Only look at units after this one: each pair is visited once
        # and the weight is written in both directions.
        for second_id in ordered_ids[position + 1 :]:
            second_lat, second_lon = unit_centroids[second_id]
            meters = haversine_distance_meters(
                latitude_a=first_lat,
                longitude_a=first_lon,
                latitude_b=second_lat,
                longitude_b=second_lon,
            )
            # Zero distance is excluded (it would divide by zero), as is
            # anything beyond the neighbor threshold.
            if not (0 < meters <= threshold_meters):
                continue
            inverse = 1.0 / meters
            weights[first_id][second_id] = inverse
            weights[second_id][first_id] = inverse

    if not row_standardize:
        return weights

    for unit_id in ordered_ids:
        row = weights[unit_id]
        total = sum(row.values())
        # Rows without neighbors sum to zero and are left empty.
        if total > 0:
            weights[unit_id] = {
                neighbor: value / total for neighbor, value in row.items()
            }

    return weights

centroids_from_boundaries

centroids_from_boundaries(
    boundaries: Any,
) -> dict[str, tuple[float, float]]

Extract centroids from a :class:BoundaryCollection.

Computes a per-feature centroid as the mean of the exterior-ring coordinates. This is approximate but cheap and avoids a hard dependency on shapely.

.. note::

As of nyc-geo-toolkit v0.4.0,
:func:`nyc_geo_toolkit.centroids_from_boundaries` is available
as a shapely-backed, publication-grade centroid helper — it
returns a :class:`BoundaryCollection` of GeoJSON ``Point``
features at either the geometric centroid (default) or
shapely's ``representative_point`` (guaranteed to lie inside
concave polygons such as NYC's jagged community districts).
Prefer it when you already have shapely installed and need
defensible geometry for a published analysis.

nyc311's helper is intentionally the **shapely-free** path
(returns a plain ``dict[str, (lat, lon)]`` suitable for
feeding directly into :func:`build_distance_weights`) and is
preserved for workflows that need to stay on the lean base
install. The two helpers return different shapes and slightly
different numbers; don't swap them mid-analysis.

Parameters:

Name Type Description Default
boundaries Any

A boundary collection exposing a features iterable. Each feature must provide a geometry mapping with "type" ("Polygon" or "MultiPolygon") and "coordinates", plus a geography_value attribute.

required

Returns:

Type Description
dict[str, tuple[float, float]]

Mapping {geography_value: (latitude, longitude)} for every feature whose exterior ring is non-empty.

Source code in src/nyc311/temporal/_spatial_weights.py
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
def centroids_from_boundaries(boundaries: Any) -> dict[str, tuple[float, float]]:
    """Extract centroids from a :class:`BoundaryCollection`.

    Computes a per-feature centroid as the mean of the exterior-ring
    coordinates. This is approximate but cheap and avoids a hard
    dependency on shapely. For a ``"Polygon"`` the exterior ring is
    ``coordinates[0]``; for any other geometry type the feature is
    treated as a ``"MultiPolygon"`` and only the exterior ring of its
    first polygon (``coordinates[0][0]``) is used.

    .. note::

        As of nyc-geo-toolkit v0.4.0,
        :func:`nyc_geo_toolkit.centroids_from_boundaries` is available
        as a shapely-backed, publication-grade centroid helper — it
        returns a :class:`BoundaryCollection` of GeoJSON ``Point``
        features at either the geometric centroid (default) or
        shapely's ``representative_point`` (guaranteed to lie inside
        concave polygons such as NYC's jagged community districts).
        Prefer it when you already have shapely installed and need
        defensible geometry for a published analysis.

        nyc311's helper is intentionally the **shapely-free** path
        (returns a plain ``dict[str, (lat, lon)]`` suitable for
        feeding directly into :func:`build_distance_weights`) and is
        preserved for workflows that need to stay on the lean base
        install. The two helpers return different shapes and slightly
        different numbers; don't swap them mid-analysis.

    Args:
        boundaries: A boundary collection exposing a ``features``
            iterable. Each feature must provide a ``geometry`` mapping
            with ``"type"`` (``"Polygon"`` or ``"MultiPolygon"``) and
            ``"coordinates"``, plus a ``geography_value`` attribute.

    Returns:
        Mapping ``{geography_value: (latitude, longitude)}`` for every
        feature whose exterior ring is non-empty. Features with empty
        coordinates, an empty first polygon, or an empty exterior ring
        are skipped.
    """
    centroids: dict[str, tuple[float, float]] = {}
    for feature in boundaries.features:
        geometry = feature.geometry
        coords = geometry.get("coordinates", [])
        if not coords:
            continue

        if geometry.get("type") == "Polygon":
            ring = coords[0]
        else:
            # A MultiPolygon whose first polygon has no rings has no
            # exterior ring; treat it as empty instead of indexing into
            # it, which would raise IndexError.
            first_polygon = coords[0]
            ring = first_polygon[0] if first_polygon else []
        if not ring:
            continue

        # GeoJSON positions are (longitude, latitude); the result is
        # reported as (latitude, longitude).
        lons = [pt[0] for pt in ring]
        lats = [pt[1] for pt in ring]
        centroids[feature.geography_value] = (
            sum(lats) / len(lats),
            sum(lons) / len(lons),
        )
    return centroids

weights_to_pysal

weights_to_pysal(
    weights: dict[str, dict[str, float]],
) -> Any

Convert a weights dict to a :class:libpysal.weights.W object.

Parameters:

Name Type Description Default
weights dict[str, dict[str, float]]

Nested dictionary {unit_a: {unit_b: weight}} as produced by :func:build_distance_weights.

required

Returns:

Type Description
Any

A libpysal.weights.W instance suitable for use with PySAL's spatial autocorrelation routines.

Raises:

Type Description
ImportError

If libpysal is not installed. Install the optional stats extra with pip install nyc311[stats].

Source code in src/nyc311/temporal/_spatial_weights.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
def weights_to_pysal(weights: dict[str, dict[str, float]]) -> Any:
    """Wrap a nested weights dictionary in a :class:`libpysal.weights.W`.

    Args:
        weights: Nested dictionary ``{unit_a: {unit_b: weight}}`` as
            produced by :func:`build_distance_weights`.

    Returns:
        A ``libpysal.weights.W`` instance suitable for use with PySAL's
        spatial autocorrelation routines.

    Raises:
        ImportError: If libpysal is not installed. Install the optional
            stats extra with ``pip install nyc311[stats]``.
    """
    try:
        from libpysal.weights import W
    except ImportError as exc:
        message = (
            "libpysal is required for spatial weights. "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(message) from exc

    # W takes two parallel mappings: neighbor ids and their weights. Both
    # lists are read from the same inner dict, so they share one order.
    neighbor_ids: dict[str, list[str]] = {}
    neighbor_weights: dict[str, list[float]] = {}
    for unit_id, row in weights.items():
        neighbor_ids[unit_id] = [*row]
        neighbor_weights[unit_id] = [*row.values()]
    return W(neighbor_ids, neighbor_weights)

Stats

nyc311.stats

PhD-level statistical modeling for NYC 311 complaint analysis.

STLAnomalyResult dataclass

Result of STL-residual anomaly detection.

Source code in src/nyc311/stats/_anomaly.py
18
19
20
21
22
23
24
25
26
27
@dataclass(frozen=True, slots=True)
class STLAnomalyResult:
    """Result of STL-residual anomaly detection.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction. Per the ``detect_stl_anomalies`` documentation it holds
    the anomaly dates, their z-scores, and summary statistics of the
    residual distribution.
    """

    # Dates of the observations flagged as anomalous.
    anomaly_dates: tuple[Any, ...]
    # Residual z-scores of the flagged observations; presumably aligned
    # one-to-one with ``anomaly_dates`` — confirm against the constructor.
    anomaly_scores: tuple[float, ...]
    # Absolute z-score cutoff above which an observation is flagged.
    threshold: float
    # Number of flagged observations; presumably ``len(anomaly_dates)``.
    n_anomalies: int
    # Mean of the residual distribution.
    residual_mean: float
    # Standard deviation of the residual distribution.
    residual_std: float

anomaly_dates instance-attribute

anomaly_dates: tuple[Any, ...]

anomaly_scores instance-attribute

anomaly_scores: tuple[float, ...]

threshold instance-attribute

threshold: float

n_anomalies instance-attribute

n_anomalies: int

residual_mean instance-attribute

residual_mean: float

residual_std instance-attribute

residual_std: float

BYM2Result dataclass

Result of BYM2 small-area smoothing.

Source code in src/nyc311/stats/_bym2.py
23
24
25
26
27
28
29
30
31
32
33
34
35
@dataclass(frozen=True, slots=True)
class BYM2Result:
    """Result of BYM2 small-area smoothing.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction.
    """

    # NOTE(review): the field notes below are inferred from names and types
    # only; confirm against the code that constructs this result.

    # Presumably unit id -> smoothed rate.
    smoothed_rates: dict[str, float]
    # Presumably unit id -> lower bound of the credible interval.
    credible_lower: dict[str, float]
    # Presumably unit id -> upper bound of the credible interval.
    credible_upper: dict[str, float]
    # Presumably the BYM2 mixing parameter between spatial and iid effects.
    mixing_parameter: float
    # Presumably the variance of the spatially structured component.
    spatial_variance: float
    # Presumably the variance of the unstructured (iid) component.
    iid_variance: float
    # Identifiers of the units covered by the result.
    unit_ids: tuple[str, ...]
    # Presumably the number of posterior samples drawn.
    n_samples: int
    # Text summary of the fitted model.
    model_summary: str

smoothed_rates instance-attribute

smoothed_rates: dict[str, float]

credible_lower instance-attribute

credible_lower: dict[str, float]

credible_upper instance-attribute

credible_upper: dict[str, float]

mixing_parameter instance-attribute

mixing_parameter: float

spatial_variance instance-attribute

spatial_variance: float

iid_variance instance-attribute

iid_variance: float

unit_ids instance-attribute

unit_ids: tuple[str, ...]

n_samples instance-attribute

n_samples: int

model_summary instance-attribute

model_summary: str

ChangepointResult dataclass

Detected structural breaks in a time series.

Source code in src/nyc311/stats/_changepoint.py
25
26
27
28
29
30
31
32
@dataclass(frozen=True, slots=True)
class ChangepointResult:
    """Detected structural breaks in a time series.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction.
    """

    # NOTE(review): the field notes below are inferred from names and types
    # only; confirm against the code that constructs this result.

    # Presumably integer positions of the breaks within the input series.
    breakpoints: tuple[int, ...]
    # Presumably the dates corresponding to ``breakpoints``.
    breakpoint_dates: tuple[date, ...]
    # Number of segments the series is split into.
    n_segments: int
    # Presumably the penalty value used by the detection algorithm.
    penalty: float

breakpoints instance-attribute

breakpoints: tuple[int, ...]

breakpoint_dates instance-attribute

breakpoint_dates: tuple[date, ...]

n_segments instance-attribute

n_segments: int

penalty instance-attribute

penalty: float

DecompositionResult dataclass

Seasonal + trend + residual decomposition.

Source code in src/nyc311/stats/_decomposition.py
24
25
26
27
28
29
30
31
@dataclass(frozen=True, slots=True)
class DecompositionResult:
    """Seasonal + trend + residual decomposition.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction. The three components are typed ``Any``.
    """

    # NOTE(review): the components are presumably pandas objects sharing the
    # input series' index — confirm against the code that builds this result.

    # Trend component.
    trend: Any
    # Seasonal component.
    seasonal: Any
    # Residual (remainder) component.
    residual: Any
    # Presumably the seasonal period, in observations, used for the fit.
    period: int

trend instance-attribute

trend: Any

seasonal instance-attribute

seasonal: Any

residual instance-attribute

residual: Any

period instance-attribute

period: int

OaxacaBlinderResult dataclass

Oaxaca-Blinder decomposition of an outcome gap.

Source code in src/nyc311/stats/_equity.py
32
33
34
35
36
37
38
39
40
41
42
43
@dataclass(frozen=True, slots=True)
class OaxacaBlinderResult:
    """Oaxaca-Blinder decomposition of an outcome gap.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction.
    """

    # NOTE(review): the field notes below are inferred from names and types
    # only (including sign conventions); confirm against the estimator.

    # Mean outcome of group A.
    mean_group_a: float
    # Mean outcome of group B.
    mean_group_b: float
    # Presumably the difference between the two group means.
    total_gap: float
    # Presumably the part of the gap attributed to covariate differences.
    explained: float
    # Presumably the part of the gap not attributed to covariates.
    unexplained: float
    # Presumably covariate name -> contribution to the gap.
    component_contributions: dict[str, float]
    # Number of observations in group A.
    n_group_a: int
    # Number of observations in group B.
    n_group_b: int

mean_group_a instance-attribute

mean_group_a: float

mean_group_b instance-attribute

mean_group_b: float

total_gap instance-attribute

total_gap: float

explained instance-attribute

explained: float

unexplained instance-attribute

unexplained: float

component_contributions instance-attribute

component_contributions: dict[str, float]

n_group_a instance-attribute

n_group_a: int

n_group_b instance-attribute

n_group_b: int

TheilResult dataclass

Population-weighted Theil T index with group decomposition.

Source code in src/nyc311/stats/_equity.py
46
47
48
49
50
51
52
53
54
@dataclass(frozen=True, slots=True)
class TheilResult:
    """Population-weighted Theil T index with group decomposition.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction.
    """

    # NOTE(review): the field notes below are inferred from names and types
    # only; confirm against the code that constructs this result.

    # Overall Theil T index.
    total: float
    # Presumably the between-group part of the decomposition.
    between_group: float
    # Presumably the within-group part of the decomposition.
    within_group: float
    # Presumably unit id -> that unit's contribution to the index.
    unit_contributions: dict[str, float]
    # Number of units included in the calculation.
    n_units: int

total instance-attribute

total: float

between_group instance-attribute

between_group: float

within_group instance-attribute

within_group: float

unit_contributions instance-attribute

unit_contributions: dict[str, float]

n_units instance-attribute

n_units: int

GWRResult dataclass

Result of a geographically weighted regression.

Source code in src/nyc311/stats/_gwr.py
23
24
25
26
27
28
29
30
31
32
33
34
@dataclass(frozen=True, slots=True)
class GWRResult:
    """Result of a geographically weighted regression.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction.
    """

    # NOTE(review): the field notes below are inferred from names and types
    # only; confirm against the code that constructs this result.

    # Presumably variable name -> per-unit coefficients, in ``unit_ids`` order.
    local_coefficients: dict[str, tuple[float, ...]]
    # Presumably the per-unit local R-squared, in ``unit_ids`` order.
    local_r_squared: tuple[float, ...]
    # Presumably the kernel bandwidth used for the fit.
    bandwidth: float
    # Presumably the Akaike information criterion of the fit.
    aic: float
    # Identifiers of the units covered by the result.
    unit_ids: tuple[str, ...]
    # Presumably the R-squared of the overall model.
    global_r_squared: float
    # Number of observations used in the fit.
    n_observations: int
    # Text summary of the fitted model.
    model_summary: str

local_coefficients instance-attribute

local_coefficients: dict[str, tuple[float, ...]]

local_r_squared instance-attribute

local_r_squared: tuple[float, ...]

bandwidth instance-attribute

bandwidth: float

aic instance-attribute

aic: float

unit_ids instance-attribute

unit_ids: tuple[str, ...]

global_r_squared instance-attribute

global_r_squared: float

n_observations instance-attribute

n_observations: int

model_summary instance-attribute

model_summary: str

HawkesResult dataclass

Result of a Hawkes process estimation.

Source code in src/nyc311/stats/_hawkes.py
30
31
32
33
34
35
36
37
38
39
40
@dataclass(frozen=True, slots=True)
class HawkesResult:
    """Result of a Hawkes process estimation.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction.
    """

    # NOTE(review): the field notes below are inferred from names and types
    # only (kernel form and units are not visible here); confirm against
    # the estimator.

    # Presumably the baseline (background) event intensity.
    background_rate: float
    # Presumably the ``alpha`` parameter of the triggering kernel.
    triggering_kernel_alpha: float
    # Presumably the ``beta`` parameter of the triggering kernel.
    triggering_kernel_beta: float
    # Presumably the expected number of events triggered per event.
    branching_ratio: float
    # Number of events used in the estimation.
    n_events: int
    # Log-likelihood of the fitted model.
    log_likelihood: float
    # Text summary of the fitted model.
    model_summary: str

background_rate instance-attribute

background_rate: float

triggering_kernel_alpha instance-attribute

triggering_kernel_alpha: float

triggering_kernel_beta instance-attribute

triggering_kernel_beta: float

branching_ratio instance-attribute

branching_ratio: float

n_events instance-attribute

n_events: int

log_likelihood instance-attribute

log_likelihood: float

model_summary instance-attribute

model_summary: str

ITSResult dataclass

Result of a segmented interrupted-time-series regression.

Source code in src/nyc311/stats/_its.py
18
19
20
21
22
23
24
25
26
27
28
@dataclass(frozen=True, slots=True)
class ITSResult:
    """Result of a segmented interrupted-time-series regression.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction.
    """

    # NOTE(review): the field notes below are inferred from names and types
    # only; confirm against the code that constructs this result.

    # Presumably the slope before the interruption.
    pre_trend: float
    # Presumably the slope after the interruption.
    post_trend: float
    # Presumably the immediate shift in level at the interruption.
    level_change: float
    # Presumably the change in slope at the interruption.
    trend_change: float
    # Presumably the p-value for ``level_change``.
    p_value_level: float
    # Presumably the p-value for ``trend_change``.
    p_value_trend: float
    # Text summary of the fitted model.
    model_summary: str

pre_trend instance-attribute

pre_trend: float

post_trend instance-attribute

post_trend: float

level_change instance-attribute

level_change: float

trend_change instance-attribute

trend_change: float

p_value_level instance-attribute

p_value_level: float

p_value_trend instance-attribute

p_value_trend: float

model_summary instance-attribute

model_summary: str

PanelRegressionResult dataclass

Summary of a panel regression fit.

Source code in src/nyc311/stats/_panel_models.py
26
27
28
29
30
31
32
33
34
35
36
37
38
@dataclass(frozen=True, slots=True)
class PanelRegressionResult:
    """Summary of a panel regression fit.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction.
    """

    # NOTE(review): the field notes below are inferred from names and types
    # only; confirm against the code that constructs this result.

    # Name of the estimation method that produced the fit.
    method: str
    # Presumably regressor name -> estimated coefficient.
    coefficients: dict[str, float]
    # Presumably regressor name -> standard error.
    std_errors: dict[str, float]
    # Presumably regressor name -> p-value.
    p_values: dict[str, float]
    # R-squared of the fit.
    r_squared: float
    # Number of observations used in the fit.
    n_observations: int
    # Number of panel entities (units).
    n_entities: int
    # Number of time periods.
    n_periods: int
    # Text summary of the fitted model.
    model_summary: str

method instance-attribute

method: str

coefficients instance-attribute

coefficients: dict[str, float]

std_errors instance-attribute

std_errors: dict[str, float]

p_values instance-attribute

p_values: dict[str, float]

r_squared instance-attribute

r_squared: float

n_observations instance-attribute

n_observations: int

n_entities instance-attribute

n_entities: int

n_periods instance-attribute

n_periods: int

model_summary instance-attribute

model_summary: str

PowerResult dataclass

Result of a power / minimum detectable effect calculation.

Source code in src/nyc311/stats/_power.py
12
13
14
15
16
17
18
19
20
21
22
@dataclass(frozen=True, slots=True)
class PowerResult:
    """Result of a power / minimum detectable effect calculation.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction.
    """

    # NOTE(review): the field notes below are inferred from names and types
    # only; confirm against the code that constructs this result.

    # Minimum detectable effect.
    mde: float
    # Presumably the significance level used in the calculation.
    alpha: float
    # Presumably the statistical power used in the calculation.
    power: float
    # Number of units in the design.
    n_units: int
    # Number of periods in the design.
    n_periods: int
    # Presumably the intraclass correlation coefficient.
    icc: float
    # Presumably the share of outcome variance explained by covariates.
    variance_explained: float

mde instance-attribute

mde: float

alpha instance-attribute

alpha: float

power instance-attribute

power: float

n_units instance-attribute

n_units: int

n_periods instance-attribute

n_periods: int

icc instance-attribute

icc: float

variance_explained instance-attribute

variance_explained: float

RDResult dataclass

Result of a regression discontinuity estimation.

Source code in src/nyc311/stats/_rdd.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
@dataclass(frozen=True, slots=True)
class RDResult:
    """Result of a regression discontinuity estimation.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction.
    """

    # NOTE(review): the field notes below are inferred from names and types
    # only; confirm against the code that constructs this result.

    # Estimated treatment effect at the cutoff.
    treatment_effect: float
    # Presumably the robust standard error of the estimate.
    se_robust: float
    # P-value of the estimate.
    p_value: float
    # Lower bound of the confidence interval.
    ci_lower: float
    # Upper bound of the confidence interval.
    ci_upper: float
    # Presumably the bandwidth used to the left of the cutoff.
    bandwidth_left: float
    # Presumably the bandwidth used to the right of the cutoff.
    bandwidth_right: float
    # Presumably the effective number of observations left of the cutoff.
    n_effective_left: int
    # Presumably the effective number of observations right of the cutoff.
    n_effective_right: int
    # Name of the kernel used for weighting.
    kernel: str
    # Text summary of the fitted model.
    model_summary: str

treatment_effect instance-attribute

treatment_effect: float

se_robust instance-attribute

se_robust: float

p_value instance-attribute

p_value: float

ci_lower instance-attribute

ci_lower: float

ci_upper instance-attribute

ci_upper: float

bandwidth_left instance-attribute

bandwidth_left: float

bandwidth_right instance-attribute

bandwidth_right: float

n_effective_left instance-attribute

n_effective_left: int

n_effective_right instance-attribute

n_effective_right: int

kernel instance-attribute

kernel: str

model_summary instance-attribute

model_summary: str

LatentReportingResult dataclass

Result of latent reporting-bias EM estimation.

Source code in src/nyc311/stats/_reporting_bias.py
48
49
50
51
52
53
54
55
56
57
@dataclass(frozen=True, slots=True)
class LatentReportingResult:
    """Result of latent reporting-bias EM estimation.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction.
    """

    # NOTE(review): the field notes below are inferred from names and types
    # only; confirm against the code that constructs this result.

    # Presumably unit id -> estimated underlying (true) rate.
    estimated_true_rates: dict[str, float]
    # Presumably unit id -> estimated probability that an event is reported.
    reporting_probabilities: dict[str, float]
    # Presumably unit id -> observed rate.
    observed_rates: dict[str, float]
    # Number of iterations that were run.
    n_iterations: int
    # Whether the estimation converged.
    converged: bool
    # Presumably the log-likelihood recorded at each iteration.
    log_likelihood_trace: tuple[float, ...]

estimated_true_rates instance-attribute

estimated_true_rates: dict[str, float]

reporting_probabilities instance-attribute

reporting_probabilities: dict[str, float]

observed_rates instance-attribute

observed_rates: dict[str, float]

n_iterations instance-attribute

n_iterations: int

converged instance-attribute

converged: bool

log_likelihood_trace instance-attribute

log_likelihood_trace: tuple[float, ...]

ReportingAdjustmentResult dataclass

Result of ecometric reporting-rate adjustment.

Source code in src/nyc311/stats/_reporting_bias.py
36
37
38
39
40
41
42
43
44
45
@dataclass(frozen=True, slots=True)
class ReportingAdjustmentResult:
    """Result of ecometric reporting-rate adjustment.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction.
    """

    # NOTE(review): the field notes below are inferred from names and types
    # only; confirm against the code that constructs this result.

    # Presumably unit id -> unadjusted rate.
    raw_rates: dict[str, float]
    # Presumably unit id -> rate after the reporting adjustment.
    adjusted_rates: dict[str, float]
    # Presumably unit id -> factor relating raw and adjusted rates.
    adjustment_factors: dict[str, float]
    # Names of the covariates used in the adjustment model.
    covariates_used: tuple[str, ...]
    # Presumably the intraclass correlation coefficient of the model.
    icc: float
    # Text summary of the fitted model.
    model_summary: str

raw_rates instance-attribute

raw_rates: dict[str, float]

adjusted_rates instance-attribute

adjusted_rates: dict[str, float]

adjustment_factors instance-attribute

adjustment_factors: dict[str, float]

covariates_used instance-attribute

covariates_used: tuple[str, ...]

icc instance-attribute

icc: float

model_summary instance-attribute

model_summary: str

LISAResult dataclass

Local Indicators of Spatial Association.

Source code in src/nyc311/stats/_spatial.py
35
36
37
38
39
40
41
42
@dataclass(frozen=True, slots=True)
class LISAResult:
    """Local Indicators of Spatial Association.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction.
    """

    # NOTE(review): the tuples are presumably aligned one-to-one with
    # ``unit_ids``; confirm against the code that constructs this result.

    # Local statistic for each unit.
    local_statistic: tuple[float, ...]
    # P-value for each unit's local statistic.
    p_values: tuple[float, ...]
    # Cluster label for each unit (label vocabulary not visible here).
    cluster_labels: tuple[str, ...]
    # Identifiers of the units covered by the result.
    unit_ids: tuple[str, ...]

local_statistic instance-attribute

local_statistic: tuple[float, ...]

p_values instance-attribute

p_values: tuple[float, ...]

cluster_labels instance-attribute

cluster_labels: tuple[str, ...]

unit_ids instance-attribute

unit_ids: tuple[str, ...]

MoranResult dataclass

Global Moran's I test result.

Source code in src/nyc311/stats/_spatial.py
25
26
27
28
29
30
31
32
@dataclass(frozen=True, slots=True)
class MoranResult:
    """Global Moran's I test result.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction.
    """

    # NOTE(review): the field notes below are inferred from names and types
    # only (e.g. how the p-value is derived); confirm against the estimator.

    # Observed Moran's I statistic.
    statistic: float
    # P-value of the test.
    p_value: float
    # Standardized (z) score of the statistic.
    z_score: float
    # Presumably the expected value of the statistic under the null.
    expected: float

statistic instance-attribute

statistic: float

p_value instance-attribute

p_value: float

z_score instance-attribute

z_score: float

expected instance-attribute

expected: float

SpatialErrorResult dataclass

Result of a spatial error (SEM) model.

Source code in src/nyc311/stats/_spatial_regression.py
36
37
38
39
40
41
42
43
44
45
46
47
48
@dataclass(frozen=True, slots=True)
class SpatialErrorResult:
    """Result of a spatial error (SEM) model.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction.
    """

    # NOTE(review): the field notes below are inferred from names and types
    # only; confirm against the code that constructs this result.

    # Presumably regressor name -> estimated coefficient.
    coefficients: dict[str, float]
    # Presumably regressor name -> standard error.
    std_errors: dict[str, float]
    # Presumably regressor name -> p-value.
    p_values: dict[str, float]
    # Presumably the spatial error coefficient (lambda).
    lam: float
    # Presumably the p-value for ``lam``.
    lam_p_value: float
    # Log-likelihood of the fitted model.
    log_likelihood: float
    # Presumably the Akaike information criterion of the fit.
    aic: float
    # Number of observations used in the fit.
    n_observations: int
    # Text summary of the fitted model.
    model_summary: str

coefficients instance-attribute

coefficients: dict[str, float]

std_errors instance-attribute

std_errors: dict[str, float]

p_values instance-attribute

p_values: dict[str, float]

lam instance-attribute

lam: float

lam_p_value instance-attribute

lam_p_value: float

log_likelihood instance-attribute

log_likelihood: float

aic instance-attribute

aic: float

n_observations instance-attribute

n_observations: int

model_summary instance-attribute

model_summary: str

SpatialLagResult dataclass

Result of a spatial lag (SAR) model.

Source code in src/nyc311/stats/_spatial_regression.py
21
22
23
24
25
26
27
28
29
30
31
32
33
@dataclass(frozen=True, slots=True)
class SpatialLagResult:
    """Result of a spatial lag (SAR) model.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction.
    """

    # NOTE(review): the field notes below are inferred from names and types
    # only; confirm against the code that constructs this result.

    # Presumably regressor name -> estimated coefficient.
    coefficients: dict[str, float]
    # Presumably regressor name -> standard error.
    std_errors: dict[str, float]
    # Presumably regressor name -> p-value.
    p_values: dict[str, float]
    # Presumably the spatial lag coefficient (rho).
    rho: float
    # Presumably the p-value for ``rho``.
    rho_p_value: float
    # Log-likelihood of the fitted model.
    log_likelihood: float
    # Presumably the Akaike information criterion of the fit.
    aic: float
    # Number of observations used in the fit.
    n_observations: int
    # Text summary of the fitted model.
    model_summary: str

coefficients instance-attribute

coefficients: dict[str, float]

std_errors instance-attribute

std_errors: dict[str, float]

p_values instance-attribute

p_values: dict[str, float]

rho instance-attribute

rho: float

rho_p_value instance-attribute

rho_p_value: float

log_likelihood instance-attribute

log_likelihood: float

aic instance-attribute

aic: float

n_observations instance-attribute

n_observations: int

model_summary instance-attribute

model_summary: str

EventStudyResult dataclass

Event-study coefficients with pre-trend diagnostics.

Source code in src/nyc311/stats/_staggered_did.py
50
51
52
53
54
55
56
57
58
59
60
61
@dataclass(frozen=True, slots=True)
class EventStudyResult:
    """Event-study coefficients with pre-trend diagnostics.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction.
    """

    # NOTE(review): the per-period tuples are presumably aligned one-to-one
    # with ``relative_periods``; confirm against the estimator.

    # Estimated coefficient for each relative period.
    coefficients: tuple[float, ...]
    # Standard error for each coefficient.
    std_errors: tuple[float, ...]
    # Lower confidence bound for each coefficient.
    ci_lower: tuple[float, ...]
    # Upper confidence bound for each coefficient.
    ci_upper: tuple[float, ...]
    # Presumably period offsets relative to treatment.
    relative_periods: tuple[int, ...]
    # F-statistic of the pre-trend test; may be ``None``.
    pre_trend_f_statistic: float | None
    # P-value of the pre-trend test; may be ``None``.
    pre_trend_p_value: float | None
    # Presumably the relative period used as the omitted baseline.
    reference_period: int

coefficients instance-attribute

coefficients: tuple[float, ...]

std_errors instance-attribute

std_errors: tuple[float, ...]

ci_lower instance-attribute

ci_lower: tuple[float, ...]

ci_upper instance-attribute

ci_upper: tuple[float, ...]

relative_periods instance-attribute

relative_periods: tuple[int, ...]

pre_trend_f_statistic instance-attribute

pre_trend_f_statistic: float | None

pre_trend_p_value instance-attribute

pre_trend_p_value: float | None

reference_period instance-attribute

reference_period: int

GroupTimeATT dataclass

A single group-time average treatment effect.

Source code in src/nyc311/stats/_staggered_did.py
24
25
26
27
28
29
30
31
32
@dataclass(frozen=True, slots=True)
class GroupTimeATT:
    """A single group-time average treatment effect.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction.
    """

    # NOTE(review): the field notes below are inferred from names and types
    # only; confirm against the code that constructs this result.

    # Presumably the label of the treatment cohort (group).
    group: str
    # Label of the period the effect refers to.
    period: str
    # Estimated average treatment effect on the treated.
    att: float
    # Standard error of ``att``.
    se: float
    # P-value of ``att``.
    p_value: float

group instance-attribute

group: str

period instance-attribute

period: str

att instance-attribute

att: float

se instance-attribute

se: float

p_value instance-attribute

p_value: float

StaggeredDiDResult dataclass

Result of a staggered difference-in-differences estimation.

Source code in src/nyc311/stats/_staggered_did.py
35
36
37
38
39
40
41
42
43
44
45
46
47
@dataclass(frozen=True, slots=True)
class StaggeredDiDResult:
    """Result of a staggered difference-in-differences estimation.

    Frozen, slotted dataclass: attributes cannot be reassigned after
    construction.
    """

    # NOTE(review): the field notes below are inferred from names and types
    # only (e.g. the aggregation scheme); confirm against the estimator.

    # Individual group-time effects, one ``GroupTimeATT`` each.
    group_time_atts: tuple[GroupTimeATT, ...]
    # Aggregated average treatment effect on the treated.
    aggregated_att: float
    # Standard error of ``aggregated_att``.
    aggregated_se: float
    # P-value of ``aggregated_att``.
    aggregated_p_value: float
    # Lower confidence bound of ``aggregated_att``.
    aggregated_ci_lower: float
    # Upper confidence bound of ``aggregated_att``.
    aggregated_ci_upper: float
    # Number of treatment groups.
    n_groups: int
    # Number of time periods.
    n_periods: int
    # Text summary of the fitted model.
    model_summary: str

group_time_atts instance-attribute

group_time_atts: tuple[GroupTimeATT, ...]

aggregated_att instance-attribute

aggregated_att: float

aggregated_se instance-attribute

aggregated_se: float

aggregated_p_value instance-attribute

aggregated_p_value: float

aggregated_ci_lower instance-attribute

aggregated_ci_lower: float

aggregated_ci_upper instance-attribute

aggregated_ci_upper: float

n_groups instance-attribute

n_groups: int

n_periods instance-attribute

n_periods: int

model_summary instance-attribute

model_summary: str

SyntheticControlResult dataclass

Result of a synthetic control analysis.

Source code in src/nyc311/stats/_synthetic_control.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
@dataclass(frozen=True, slots=True)
class SyntheticControlResult:
    """Result of a synthetic control analysis.

    Frozen, slotted record. The field meanings below are inferred from
    the field names; confirm against the function that builds this
    result.

    NOTE(review): ``donor_weights`` is a plain ``dict``. ``frozen=True``
    only blocks attribute rebinding, so the mapping itself can still be
    mutated, and calling ``hash()`` on an instance raises ``TypeError``
    because the generated ``__hash__`` hashes every field.

    Attributes:
        treated_unit: Identifier of the treated unit.
        donor_weights: Mapping ``{donor_unit_id: weight}``.
        counterfactual: Synthetic (counterfactual) outcome series.
        observed: Observed outcome series for the treated unit.
        treatment_effect: Per-period effect series (presumably
            ``observed - counterfactual`` -- TODO confirm).
        att: Scalar average treatment effect.
        periods: Period labels, presumably aligned with the series
            tuples above.
        pre_treatment_mspe: Pre-treatment mean squared prediction error.
        placebo_p_value: Placebo-test p-value, or ``None`` when it is
            not available.
        model_summary: Human-readable text summary of the fit.
    """

    treated_unit: str
    donor_weights: dict[str, float]
    counterfactual: tuple[float, ...]
    observed: tuple[float, ...]
    treatment_effect: tuple[float, ...]
    att: float
    periods: tuple[str, ...]
    pre_treatment_mspe: float
    placebo_p_value: float | None
    model_summary: str

treated_unit instance-attribute

treated_unit: str

donor_weights instance-attribute

donor_weights: dict[str, float]

counterfactual instance-attribute

counterfactual: tuple[float, ...]

observed instance-attribute

observed: tuple[float, ...]

treatment_effect instance-attribute

treatment_effect: tuple[float, ...]

att instance-attribute

att: float

periods instance-attribute

periods: tuple[str, ...]

pre_treatment_mspe instance-attribute

pre_treatment_mspe: float

placebo_p_value instance-attribute

placebo_p_value: float | None

model_summary instance-attribute

model_summary: str

detect_stl_anomalies

detect_stl_anomalies(
    series: Any,
    *,
    period: int | None = None,
    threshold: float = 2.0,
) -> STLAnomalyResult

Detect anomalies using STL decomposition residuals.

Decomposes series via STL and flags observations whose absolute residual z-score exceeds threshold.

Parameters:

Name Type Description Default
series Any

A pandas.Series indexed by a DatetimeIndex.

required
period int | None

Seasonal period in observations. When None, the period is inferred from the index frequency.

None
threshold float

Absolute z-score threshold above which an observation is flagged as anomalous. Defaults to 2.0.

2.0

Returns:

Type Description
STLAnomalyResult

An STLAnomalyResult with the anomaly dates, their z-scores, and summary statistics of the residual distribution.

Raises:

Type Description
ImportError

If statsmodels or pandas is not installed. Install with pip install nyc311[stats].

Source code in src/nyc311/stats/_anomaly.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
def detect_stl_anomalies(
    series: Any,
    *,
    period: int | None = None,
    threshold: float = 2.0,
) -> STLAnomalyResult:
    """Flag observations whose STL residual is unusually large.

    ``series`` is decomposed with :func:`seasonal_decompose`; the
    non-missing residuals are standardised with their sample mean and
    sample standard deviation (``ddof=1``), and every observation whose
    absolute z-score is strictly greater than ``threshold`` is reported.

    Args:
        series: A ``pandas.Series`` indexed by a ``DatetimeIndex``.
        period: Seasonal period in observations.  ``None`` lets
            :func:`seasonal_decompose` infer it from the index frequency.
        threshold: Absolute z-score cut-off.  Defaults to ``2.0``.

    Returns:
        An :class:`STLAnomalyResult` holding the anomaly dates, their
        z-scores, the threshold, the anomaly count, and the residual
        mean and standard deviation.  When the residual standard
        deviation is zero (including the case of fewer than two
        residuals) no anomalies are reported.

    Raises:
        ImportError: If numpy is not installed, or if
            :func:`seasonal_decompose` cannot import statsmodels or
            pandas.  Install with ``pip install nyc311[stats]``.
    """
    try:
        import numpy as np
    except ImportError as exc:
        msg = "numpy is required for detect_stl_anomalies(). Install with: pip install nyc311[stats]"
        raise ImportError(msg) from exc

    from nyc311.stats._decomposition import seasonal_decompose

    remainder = seasonal_decompose(series, period=period).residual.dropna()
    values = np.asarray(remainder.values, dtype=float)

    center = float(np.mean(values))
    # A sample standard deviation needs at least two points.
    spread = 0.0
    if len(values) > 1:
        spread = float(np.std(values, ddof=1))

    flagged_dates: tuple[Any, ...] = ()
    flagged_scores: tuple[float, ...] = ()
    if spread != 0.0:
        standardized = (values - center) / spread
        flagged = np.abs(standardized) > threshold
        flagged_dates = tuple(remainder.index[flagged])
        flagged_scores = tuple(float(score) for score in standardized[flagged])

    return STLAnomalyResult(
        anomaly_dates=flagged_dates,
        anomaly_scores=flagged_scores,
        threshold=threshold,
        n_anomalies=len(flagged_scores),
        residual_mean=center,
        residual_std=spread,
    )

bym2_smooth

bym2_smooth(
    observed_counts: dict[str, int],
    expected_counts: dict[str, float],
    adjacency: dict[str, tuple[str, ...]],
    *,
    n_samples: int = 2000,
    n_tune: int = 1000,
    random_seed: int = 42,
) -> BYM2Result

Smooth area-level rates with the BYM2 model.

Estimates: y_i ~ Poisson(E_i * exp(mu + phi_i))

where phi_i = sqrt(rho) * spatial_i + sqrt(1 - rho) * iid_i

The mixing parameter rho controls the balance between spatially structured and unstructured random effects.

Parameters:

Name Type Description Default
observed_counts dict[str, int]

Mapping {unit_id: observed_count}.

required
expected_counts dict[str, float]

Mapping {unit_id: expected_count}.

required
adjacency dict[str, tuple[str, ...]]

Mapping {unit_id: (neighbor_ids,...)}.

required
n_samples int

Number of posterior draws after tuning.

2000
n_tune int

Number of warmup / tuning iterations.

1000
random_seed int

Random seed for reproducibility.

42

Returns:

Type Description
BYM2Result

A BYM2Result with smoothed rates, 95% credible intervals, and variance decomposition.

Raises:

Type Description
ImportError

If pymc is not installed.

Source code in src/nyc311/stats/_bym2.py
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
def bym2_smooth(
    observed_counts: dict[str, int],
    expected_counts: dict[str, float],
    adjacency: dict[str, tuple[str, ...]],
    *,
    n_samples: int = 2000,
    n_tune: int = 1000,
    random_seed: int = 42,
) -> BYM2Result:
    """Smooth area-level rates with the BYM2 model.

    The model fitted below is::

        y_i   ~ Poisson(E_i * exp(psi_i))
        psi_i = mu + sigma * (sqrt(rho) * phi_i + sqrt(1 - rho) * theta_i)

    with priors ``mu ~ Normal(0, 1)``, ``sigma ~ HalfNormal(1)``,
    ``rho ~ Beta(1, 1)``, ``theta_i ~ Normal(0, 1)`` (unstructured
    effect) and ``phi ~ ICAR`` over the adjacency graph (spatially
    structured effect).

    The mixing parameter rho controls the balance between spatially
    structured and unstructured random effects.

    The set of areas is taken from ``observed_counts``; units are
    processed in sorted key order.

    Args:
        observed_counts: Mapping ``{unit_id: observed_count}``.
        expected_counts: Mapping ``{unit_id: expected_count}``. Must
            contain every key of ``observed_counts``.
        adjacency: Mapping ``{unit_id: (neighbor_ids,...)}``. Units
            missing from the mapping are treated as having no
            neighbours, and neighbour ids that are not keys of
            ``observed_counts`` are ignored.
        n_samples: Number of posterior draws after tuning.
        n_tune: Number of warmup / tuning iterations.
        random_seed: Random seed for reproducibility.

    Returns:
        A :class:`BYM2Result` with the posterior-mean rate per unit, the
        2.5th / 97.5th posterior percentiles as a 95% credible interval,
        the posterior mean of ``rho``, and a spatial / IID split of the
        posterior mean of ``sigma ** 2``.

    Raises:
        ImportError: If numpy or pymc cannot be imported (the message
            names pymc).
        KeyError: If a unit in ``observed_counts`` is missing from
            ``expected_counts``.
    """
    try:
        import numpy as np
        import pymc as pm
    except ImportError as exc:
        msg = (
            "pymc is required for bym2_smooth(). "
            "Install with: pip install nyc311[bayes]"
        )
        raise ImportError(msg) from exc

    # Fix a deterministic unit order; every array below is indexed by it.
    unit_ids = sorted(observed_counts)
    n = len(unit_ids)
    uid_to_idx = {uid: i for i, uid in enumerate(unit_ids)}

    y = np.array([observed_counts[uid] for uid in unit_ids], dtype=float)
    e = np.array([expected_counts[uid] for uid in unit_ids], dtype=float)

    # Collect each undirected edge once as (lower index, higher index).
    # NOTE(review): an edge is only recorded when it is listed under the
    # lower-sorted unit, so this assumes ``adjacency`` is symmetric -- an
    # edge listed only under the higher-sorted unit is dropped.
    adj_pairs: list[tuple[int, int]] = []
    for uid in unit_ids:
        for nb in adjacency.get(uid, ()):
            if nb in uid_to_idx:
                i, j = uid_to_idx[uid], uid_to_idx[nb]
                if i < j:
                    adj_pairs.append((i, j))

    # Endpoint index arrays handed to the adjacency-matrix helper.
    node1 = np.array([p[0] for p in adj_pairs])
    node2 = np.array([p[1] for p in adj_pairs])

    with pm.Model() as _model:
        mu = pm.Normal("mu", mu=0, sigma=1)
        sigma = pm.HalfNormal("sigma", sigma=1)
        rho = pm.Beta("rho", alpha=1, beta=1)

        # theta: unstructured (IID) effect; phi: spatially structured effect.
        theta = pm.Normal("theta", mu=0, sigma=1, shape=n)
        phi = pm.ICAR("phi", W=_build_adjacency_matrix(n, node1, node2))

        # Log relative rate: intercept plus the rho-weighted mix of effects.
        psi = pm.Deterministic(
            "psi",
            mu + sigma * (pm.math.sqrt(rho) * phi + pm.math.sqrt(1 - rho) * theta),
        )
        rate = pm.Deterministic("rate", pm.math.exp(psi))

        # Expected counts scale the rate (i.e. act as an offset).
        pm.Poisson("obs", mu=e * rate, observed=y)

        trace = pm.sample(
            draws=n_samples,
            tune=n_tune,
            random_seed=random_seed,
            progressbar=False,
            return_inferencedata=True,
        )

    # Pool all posterior draws into one (n_draws_total, n) array;
    # presumably the raw layout is (chain, draw, unit) -- TODO confirm.
    rate_samples = trace.posterior["rate"].values.reshape(-1, n)
    smoothed = rate_samples.mean(axis=0)
    lower = np.percentile(rate_samples, 2.5, axis=0)
    upper = np.percentile(rate_samples, 97.5, axis=0)

    # Variance split uses posterior means: E[rho] * E[sigma^2] for the
    # spatial part and (1 - E[rho]) * E[sigma^2] for the IID part.
    rho_samples = trace.posterior["rho"].values.flatten()
    sigma_samples = trace.posterior["sigma"].values.flatten()
    mixing = float(np.mean(rho_samples))
    total_var = float(np.mean(sigma_samples**2))
    spatial_var = mixing * total_var
    iid_var = (1 - mixing) * total_var

    summary = (
        f"BYM2: {n} areas, {len(adj_pairs)} edges\n"
        f"Mixing (rho): {mixing:.3f}\n"
        f"Total variance (sigma^2): {total_var:.4f}\n"
        f"Spatial / IID: {spatial_var:.4f} / {iid_var:.4f}"
    )

    return BYM2Result(
        smoothed_rates={uid: float(smoothed[i]) for i, uid in enumerate(unit_ids)},
        credible_lower={uid: float(lower[i]) for i, uid in enumerate(unit_ids)},
        credible_upper={uid: float(upper[i]) for i, uid in enumerate(unit_ids)},
        mixing_parameter=mixing,
        spatial_variance=spatial_var,
        iid_variance=iid_var,
        unit_ids=tuple(unit_ids),
        n_samples=n_samples,
        model_summary=summary,
    )

detect_changepoints

detect_changepoints(
    series: Any,
    *,
    method: Literal["pelt", "binseg"] = "pelt",
    penalty: float | None = None,
    min_segment_size: int = 5,
) -> ChangepointResult

Detect structural breaks in a complaint time series.

Parameters:

Name Type Description Default
series Any

A pandas.Series indexed by a DatetimeIndex.

required
method Literal['pelt', 'binseg']

Detection algorithm; one of "pelt" (default, optimal) or "binseg" (binary segmentation, faster but approximate).

'pelt'
penalty float | None

Penalty value passed to the underlying ruptures algorithm. When None, defaults to log(n) * variance, a BIC-like heuristic.

None
min_segment_size int

Minimum number of observations between consecutive changepoints.

5

Returns:

Type Description
ChangepointResult

A ChangepointResult containing the integer breakpoint indices, their corresponding dates, the resulting segment count, and the penalty actually used.

Raises:

Type Description
ImportError

If ruptures or pandas is not installed. Install the optional stats extra with pip install nyc311[stats].

TypeError

If series does not use a DatetimeIndex.

Source code in src/nyc311/stats/_changepoint.py
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def detect_changepoints(
    series: Any,
    *,
    method: Literal["pelt", "binseg"] = "pelt",
    penalty: float | None = None,
    min_segment_size: int = 5,
) -> ChangepointResult:
    """Detect structural breaks in a complaint time series.

    Missing values are dropped before detection, so breakpoint indices
    refer to positions in ``series.dropna()``.

    Args:
        series: A ``pandas.Series`` indexed by a ``DatetimeIndex``.
        method: Detection algorithm; one of ``"pelt"`` (default,
            optimal) or ``"binseg"`` (binary segmentation, faster but
            approximate).
        penalty: Penalty value passed to the underlying ``ruptures``
            algorithm. When ``None``, defaults to ``log(n) * variance``,
            a BIC-like heuristic (``1.0`` when fewer than two
            observations remain).
        min_segment_size: Minimum number of observations between
            consecutive changepoints.

    Returns:
        A :class:`ChangepointResult` containing the integer breakpoint
        indices, their corresponding dates, the resulting segment count,
        and the penalty actually used.

    Raises:
        ValueError: If ``method`` is neither ``"pelt"`` nor ``"binseg"``.
        ImportError: If ``ruptures`` or pandas is not installed. Install
            the optional stats extra with ``pip install nyc311[stats]``.
        TypeError: If ``series`` does not use a ``DatetimeIndex``.
    """
    # Validate the cheap argument first. Previously any unrecognised value
    # (for example a typo such as "PELT") silently ran binary segmentation.
    if method not in ("pelt", "binseg"):
        msg = f'method must be "pelt" or "binseg", got {method!r}.'
        raise ValueError(msg)

    try:
        import numpy as np
        import pandas as pd
        import ruptures as rpt
    except ImportError as exc:
        message = (
            "ruptures and pandas are required for detect_changepoints(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(message) from exc

    if not isinstance(series.index, pd.DatetimeIndex):
        msg = "series must have a DatetimeIndex."
        raise TypeError(msg)

    # Drop missing values once; the signal and the date lookup must share
    # exactly the same positions.
    observed = series.dropna()
    signal = observed.to_numpy().astype(float)
    n = len(signal)

    if penalty is None:
        penalty = float(np.log(n) * np.var(signal)) if n > 1 else 1.0

    detector_cls = rpt.Pelt if method == "pelt" else rpt.Binseg
    algo = detector_cls(model="l2", min_size=min_segment_size).fit(signal)

    raw_breaks: list[int] = algo.predict(pen=penalty)
    # ruptures returns the *end* of each segment; the last element equals n
    breakpoint_indices = [b for b in raw_breaks if b < n]

    dates_index = observed.index
    breakpoint_dates = []
    for idx in breakpoint_indices:
        ts = dates_index[idx]
        # Prefer a plain date when the index value can provide one.
        breakpoint_dates.append(ts.date() if hasattr(ts, "date") else ts)

    return ChangepointResult(
        breakpoints=tuple(breakpoint_indices),
        breakpoint_dates=tuple(breakpoint_dates),
        n_segments=len(breakpoint_indices) + 1,
        penalty=penalty,
    )

seasonal_decompose

seasonal_decompose(
    series: Any, *, period: int | None = None
) -> DecompositionResult

Decompose series into trend, seasonal, and residual components.

Wraps statsmodels.tsa.seasonal.STL. The series must be indexed by a DatetimeIndex.

Parameters:

Name Type Description Default
series Any

A pandas.Series indexed by a DatetimeIndex.

required
period int | None

Seasonal period in observations. When None, the period is inferred from the index frequency (monthly → 12, weekly → 52, daily → 7, quarterly → 4, yearly → 1).

None

Returns:

Type Description
DecompositionResult

A DecompositionResult exposing the trend, seasonal, and residual pandas.Series plus the period actually used.

Raises:

Type Description
ImportError

If statsmodels or pandas is not installed. Install the optional stats extra with pip install nyc311[stats].

TypeError

If series does not use a DatetimeIndex.

Source code in src/nyc311/stats/_decomposition.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def seasonal_decompose(
    series: Any,
    *,
    period: int | None = None,
) -> DecompositionResult:
    """Split ``series`` into trend, seasonal, and residual parts via STL.

    Thin wrapper around :class:`statsmodels.tsa.seasonal.STL`. Missing
    values are dropped before fitting; when the period has to be
    inferred, the frequency is read from the index of the series as
    passed in (before dropping missing values).

    Args:
        series: A ``pandas.Series`` indexed by a ``DatetimeIndex``.
        period: Seasonal period in observations. When ``None``, the
            period is inferred from the index frequency (monthly → 12,
            weekly → 52, daily → 7, quarterly → 4, yearly → 1).

    Returns:
        A :class:`DecompositionResult` exposing the trend, seasonal, and
        residual ``pandas.Series`` plus the period actually used.

    Raises:
        ImportError: If statsmodels or pandas is not installed. Install
            the optional stats extra with ``pip install nyc311[stats]``.
        TypeError: If ``series`` does not use a ``DatetimeIndex``.
    """
    try:
        import pandas as pd
        from statsmodels.tsa.seasonal import STL
    except ImportError as exc:
        message = (
            "statsmodels and pandas are required for seasonal_decompose(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(message) from exc

    has_datetime_index = isinstance(series.index, pd.DatetimeIndex)
    if not has_datetime_index:
        msg = "series must have a DatetimeIndex."
        raise TypeError(msg)

    # Only fall back to frequency inference when the caller gave no period.
    resolved_period = period
    if resolved_period is None:
        resolved_period = _infer_period(pd.infer_freq(series.index))

    fitted = STL(series.dropna(), period=resolved_period).fit()
    return DecompositionResult(
        trend=fitted.trend,
        seasonal=fitted.seasonal,
        residual=fitted.resid,
        period=resolved_period,
    )

oaxaca_blinder_decomposition

oaxaca_blinder_decomposition(
    group_a: Any,
    group_b: Any,
    outcome: str,
    regressors: tuple[str, ...],
) -> OaxacaBlinderResult

Decompose the mean-outcome gap between two groups.

Uses the Oaxaca-Blinder twofold decomposition with group B coefficients as the reference:

gap = (mean(X_a) - mean(X_b)) @ beta_b  [explained]
    + mean(X_a) @ (beta_a - beta_b)      [unexplained]

Parameters:

Name Type Description Default
group_a Any

pandas.DataFrame for the first group.

required
group_b Any

pandas.DataFrame for the second group.

required
outcome str

Name of the outcome column.

required
regressors tuple[str, ...]

Column names to include as explanatory variables.

required

Returns:

Type Description
OaxacaBlinderResult

An OaxacaBlinderResult with the total gap, explained and unexplained components, and per-variable contributions.

Raises:

Type Description
ImportError

If numpy or pandas is not installed.

ValueError

If fewer than 2 observations exist in either group.

Source code in src/nyc311/stats/_equity.py
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def oaxaca_blinder_decomposition(
    group_a: Any,
    group_b: Any,
    outcome: str,
    regressors: tuple[str, ...],
) -> OaxacaBlinderResult:
    """Split the mean-outcome gap between two groups into two parts.

    Applies the twofold Oaxaca-Blinder decomposition, taking group B's
    coefficients as the reference:

        gap = (mean(X_a) - mean(X_b)) @ beta_b  [explained]
            + mean(X_a) @ (beta_a - beta_b)      [unexplained]

    Each group's coefficients come from a least-squares fit of the
    outcome on an intercept plus the regressors; the intercept
    difference is counted in the unexplained part.

    Args:
        group_a: ``pandas.DataFrame`` for the first group.
        group_b: ``pandas.DataFrame`` for the second group.
        outcome: Name of the outcome column.
        regressors: Column names to include as explanatory variables.

    Returns:
        An :class:`OaxacaBlinderResult` with both group means, the total
        gap (group A minus group B), the explained and unexplained
        components, per-regressor explained contributions, and the two
        group sizes.

    Raises:
        ImportError: If numpy is not installed.
        ValueError: If fewer than 2 observations exist in either group.
    """
    try:
        import numpy as np
    except ImportError as exc:
        msg = "numpy is required for oaxaca_blinder_decomposition(). Install with: pip install nyc311[stats]"
        raise ImportError(msg) from exc

    def _column(frame: Any, name: str) -> Any:
        # One column of ``frame`` as a float array.
        return np.asarray(frame[name].values, dtype=float)

    def _fit(features: Any, target: Any) -> Any:
        # Least-squares coefficients, intercept first.
        design = np.column_stack([np.ones(len(features)), features])
        return np.linalg.lstsq(design, target, rcond=None)[0]

    ya = _column(group_a, outcome)
    yb = _column(group_b, outcome)

    if min(len(ya), len(yb)) < 2:
        msg = "Each group must have at least 2 observations."
        raise ValueError(msg)

    xa = np.column_stack([_column(group_a, r) for r in regressors])
    xb = np.column_stack([_column(group_b, r) for r in regressors])

    beta_a = _fit(xa, ya)
    beta_b = _fit(xb, yb)

    mean_xa = xa.mean(axis=0)
    mean_xb = xb.mean(axis=0)

    mean_a = float(ya.mean())
    mean_b = float(yb.mean())

    # Endowment differences priced at group B's slopes (intercept excluded).
    per_variable = (mean_xa - mean_xb) * beta_b[1:]
    # Coefficient differences (intercept included) evaluated at group A's means.
    unexplained = float(np.concatenate([[1], mean_xa]) @ (beta_a - beta_b))

    return OaxacaBlinderResult(
        mean_group_a=mean_a,
        mean_group_b=mean_b,
        total_gap=mean_a - mean_b,
        explained=float(per_variable.sum()),
        unexplained=unexplained,
        component_contributions={
            name: float(per_variable[pos]) for pos, name in enumerate(regressors)
        },
        n_group_a=len(ya),
        n_group_b=len(yb),
    )

theil_index

theil_index(
    values: dict[str, float],
    populations: dict[str, int],
    *,
    groups: dict[str, str] | None = None,
) -> TheilResult

Compute the population-weighted Theil T index.

When groups is provided, decomposes the total index into between-group and within-group components.

Parameters:

Name Type Description Default
values dict[str, float]

Mapping {unit_id: value} of the variable to measure inequality over (e.g. complaint rate).

required
populations dict[str, int]

Mapping {unit_id: population} for weighting.

required
groups dict[str, str] | None

Optional mapping {unit_id: group_label} for decomposition. When None, between-group and within-group are both set to 0.0.

None

Returns:

Type Description
TheilResult

A TheilResult with the total index, between/within components, per-unit contributions, and count.

Raises:

Type Description
ImportError

If numpy is not installed.

ValueError

If values and populations have mismatched keys.

Source code in src/nyc311/stats/_equity.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
def theil_index(
    values: dict[str, float],
    populations: dict[str, int],
    *,
    groups: dict[str, str] | None = None,
) -> TheilResult:
    """Compute the population-weighted Theil T index.

    Each unit contributes ``share_i * log(value_i / mean)``, where
    ``share_i`` is the unit's share of the population-weighted total and
    ``mean`` is the population-weighted mean. Units with a non-positive
    value contribute ``0.0``. When ``groups`` is given, the index is
    additionally split into between-group and within-group components.

    Args:
        values: Mapping ``{unit_id: value}`` of the variable to
            measure inequality over (e.g. complaint rate).
        populations: Mapping ``{unit_id: population}`` for weighting.
        groups: Optional mapping ``{unit_id: group_label}`` for
            decomposition. When ``None``, between-group and
            within-group are both set to ``0.0``. Groups whose total
            population or weighted value is not positive are skipped.

    Returns:
        A :class:`TheilResult` with the total index, between/within
        components, per-unit contributions, and count. All components
        are ``0.0`` when the total population or the population-weighted
        total value is not positive.

    Raises:
        ImportError: If numpy is not installed.
        ValueError: If values and populations have mismatched keys.
    """
    try:
        import numpy as np
    except ImportError as exc:
        msg = "numpy is required for theil_index(). Install with: pip install nyc311[stats]"
        raise ImportError(msg) from exc

    def _log_ratio(level: Any, center: Any) -> Any:
        # log(level / center) where level > 0, else 0.0; warnings silenced.
        with np.errstate(divide="ignore", invalid="ignore"):
            return np.where(level > 0, np.log(level / center), 0.0)

    unit_ids = sorted(values)
    if set(unit_ids) != set(populations):
        msg = "values and populations must have the same keys."
        raise ValueError(msg)

    n_units = len(unit_ids)
    v = np.array([values[uid] for uid in unit_ids], dtype=float)
    p = np.array([populations[uid] for uid in unit_ids], dtype=float)

    total_pop = p.sum()
    total_value = (v * p).sum()

    # Degenerate totals: the index is undefined, so report all zeros.
    if total_value <= 0 or total_pop <= 0:
        return TheilResult(
            total=0.0,
            between_group=0.0,
            within_group=0.0,
            unit_contributions={uid: 0.0 for uid in unit_ids},
            n_units=n_units,
        )

    mu = total_value / total_pop
    per_unit = ((v * p) / total_value) * _log_ratio(v, mu)

    between = 0.0
    within = 0.0
    if groups is not None:
        unit_labels = [groups.get(uid) for uid in unit_ids]
        for label in sorted(set(groups.values())):
            members = np.array([unit_label == label for unit_label in unit_labels])
            g_v = v[members]
            g_p = p[members]
            g_pop = g_p.sum()
            g_value = (g_v * g_p).sum()
            if g_pop <= 0 or g_value <= 0:
                continue
            g_mu = g_value / g_pop
            g_share = g_value / total_value
            between += g_share * float(np.log(g_mu / mu))

            g_shares = (g_v * g_p) / g_value
            within += g_share * float(np.sum(g_shares * _log_ratio(g_v, g_mu)))

    return TheilResult(
        total=float(np.sum(per_unit)),
        between_group=between,
        within_group=within,
        unit_contributions={
            uid: float(per_unit[pos]) for pos, uid in enumerate(unit_ids)
        },
        n_units=n_units,
    )

geographically_weighted_regression

geographically_weighted_regression(
    values: dict[str, float],
    regressors: dict[str, dict[str, float]],
    coordinates: dict[str, tuple[float, float]],
    *,
    bandwidth: float | None = None,
    kernel: str = "bisquare",
) -> GWRResult

Fit a geographically weighted regression.

Estimates locally varying coefficients, allowing the relationship between outcome and regressors to change across space.

Parameters:

Name Type Description Default
values dict[str, float]

Mapping {unit_id: outcome_value}.

required
regressors dict[str, dict[str, float]]

Mapping {unit_id: {variable_name: value}}.

required
coordinates dict[str, tuple[float, float]]

Mapping {unit_id: (latitude, longitude)}.

required
bandwidth float | None

Fixed bandwidth. When None, an optimal bandwidth is selected via cross-validation.

None
kernel str

Kernel function. One of "bisquare" (default), "gaussian", or "exponential".

'bisquare'

Returns:

Type Description
GWRResult

A GWRResult with local coefficients per unit, local R-squared values, bandwidth, and fit statistics.

Raises:

Type Description
ImportError

If mgwr is not installed.

ValueError

If fewer than 5 observations are provided.

Source code in src/nyc311/stats/_gwr.py
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
def geographically_weighted_regression(
    values: dict[str, float],
    regressors: dict[str, dict[str, float]],
    coordinates: dict[str, tuple[float, float]],
    *,
    bandwidth: float | None = None,
    kernel: str = "bisquare",
) -> GWRResult:
    """Fit a geographically weighted regression.

    Estimates locally varying coefficients, allowing the relationship
    between outcome and regressors to change across space.  One weighted
    least-squares problem is solved per unit, using kernel weights
    derived from the pairwise distance matrix.

    Args:
        values: Mapping ``{unit_id: outcome_value}``.  Its sorted keys
            define the set and order of units in the result.
        regressors: Mapping
            ``{unit_id: {variable_name: value}}``.  Variable names are
            taken from the first entry; every unit in ``values`` must
            supply all of them.
        coordinates: Mapping ``{unit_id: (latitude, longitude)}``.
            Distances are plain Euclidean distances between these pairs.
        bandwidth: Fixed bandwidth.  When ``None``, an optimal
            bandwidth is selected via cross-validation.
        kernel: Kernel function.  One of ``"bisquare"`` (default),
            ``"gaussian"``, or ``"exponential"``.

    Returns:
        A ``GWRResult`` with local coefficients per unit,
            local R-squared values, bandwidth, and fit statistics.

    Raises:
        ImportError: If numpy or scipy is not installed.
        ValueError: If fewer than 5 observations are provided, or if
            ``regressors`` is empty.
    """
    try:
        import numpy as np
        from scipy.spatial.distance import cdist
    except ImportError as exc:
        msg = (
            "numpy and scipy are required for "
            "geographically_weighted_regression(). "
            "Install with: pip install nyc311[spatial-regression]"
        )
        raise ImportError(msg) from exc

    unit_ids = sorted(values)
    if len(unit_ids) < 5:
        msg = "GWR requires at least 5 observations."
        raise ValueError(msg)
    if not regressors:
        # Without this guard next(iter(...)) below would leak StopIteration.
        msg = "regressors must not be empty."
        raise ValueError(msg)

    # The first entry defines the regressor set; sorting fixes column order.
    var_names = sorted(next(iter(regressors.values())))
    y = np.array([values[uid] for uid in unit_ids], dtype=float)
    x_raw = np.column_stack(
        [
            np.array([regressors[uid][v] for uid in unit_ids], dtype=float)
            for v in var_names
        ]
    )
    # Prepend an intercept column, reported as "CONSTANT".
    x = np.column_stack([np.ones(len(unit_ids)), x_raw])
    coords = np.array([coordinates[uid] for uid in unit_ids], dtype=float)

    # Pairwise Euclidean distances (cdist's default metric).
    dists = cdist(coords, coords)

    if bandwidth is None:
        bandwidth = _cv_bandwidth(y, x, dists, kernel)

    all_names = ["CONSTANT", *var_names]
    n = len(unit_ids)
    k = x.shape[1]
    local_betas = np.zeros((n, k))
    local_r2 = np.zeros(n)
    y_hat = np.zeros(n)

    for i in range(n):
        w_i = _kernel_weights(dists[i], bandwidth, kernel)
        # X^T W without materialising the dense n-by-n diagonal matrix:
        # broadcasting scales observation j's column of X^T by w_i[j].
        xtw = x.T * w_i
        xtwx = xtw @ x
        xtwy = xtw @ y
        try:
            beta_i = np.linalg.solve(xtwx, xtwy)
        except np.linalg.LinAlgError:
            # Singular local design: fall back to the least-squares solution.
            beta_i = np.linalg.lstsq(xtwx, xtwy, rcond=None)[0]
        local_betas[i] = beta_i
        # Only the fit at the focal unit enters the global statistics.
        y_hat[i] = x[i] @ beta_i

        # Local R-squared uses the same kernel weights as the local fit.
        ss_tot = float(np.sum(w_i * (y - np.average(y, weights=w_i)) ** 2))
        ss_res = float(np.sum(w_i * (y - x @ beta_i) ** 2))
        local_r2[i] = 1.0 - ss_res / ss_tot if ss_tot > 0 else 0.0

    ss_tot_global = float(np.sum((y - np.mean(y)) ** 2))
    ss_res_global = float(np.sum((y - y_hat) ** 2))
    global_r2 = 1.0 - ss_res_global / ss_tot_global if ss_tot_global > 0 else 0.0

    # AIC computed with the global parameter count k.
    # NOTE: a zero residual sum of squares makes the log term -inf.
    aic_val = n * np.log(ss_res_global / n) + 2 * k

    local_coefficients = {
        name: tuple(float(beta) for beta in local_betas[:, j])
        for j, name in enumerate(all_names)
    }

    summary = (
        f"GWR: {n} observations, {k} parameters\n"
        f"Bandwidth: {bandwidth:.4f}, Kernel: {kernel}\n"
        f"Global R-squared: {global_r2:.4f}, AIC: {aic_val:.2f}"
    )

    return GWRResult(
        local_coefficients=local_coefficients,
        local_r_squared=tuple(float(r) for r in local_r2),
        bandwidth=float(bandwidth),
        aic=float(aic_val),
        unit_ids=tuple(unit_ids),
        global_r_squared=float(global_r2),
        n_observations=n,
        model_summary=summary,
    )

fit_hawkes_process

fit_hawkes_process(
    event_times: Any,
    *,
    kernel: str = "exponential",
    max_iter: int = 1000,
) -> HawkesResult

Fit a univariate Hawkes process to event timestamps.

The conditional intensity is:

lambda(t) = mu + sum_{t_i < t} alpha * beta * exp(-beta * (t - t_i))

Parameters:

Name Type Description Default
event_times Any

Array-like of event timestamps as floats (e.g. seconds since epoch, or days since start).

required
kernel str

Triggering kernel type. Currently only "exponential" is supported.

'exponential'
max_iter int

Maximum iterations for the EM algorithm.

1000

Returns:

Name Type Description
HawkesResult

A HawkesResult with background rate, triggering kernel parameters, branching ratio, and log-likelihood.

Raises:

Type Description
ImportError

If numpy is not installed.

ValueError

If kernel is not "exponential", or if fewer than 3 events are provided.

Source code in src/nyc311/stats/_hawkes.py
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
def fit_hawkes_process(
    event_times: Any,
    *,
    kernel: str = "exponential",
    max_iter: int = 1000,
) -> HawkesResult:
    """Fit a univariate Hawkes process to event timestamps.

    The conditional intensity is:

        lambda(t) = mu + sum_{t_i < t} alpha * beta * exp(-beta * (t - t_i))

    With this parameterisation the triggering kernel integrates to
    ``alpha``, so ``alpha`` is the expected number of events directly
    triggered by one event (the branching ratio) and ``beta`` is the
    decay rate.  Parameters are estimated with an EM iteration.

    Args:
        event_times: Array-like of event timestamps as floats
            (e.g. seconds since epoch, or days since start).  They are
            sorted and shifted so the first event is at time zero.
        kernel: Triggering kernel type. Currently only
            ``"exponential"`` is supported.
        max_iter: Maximum iterations for the EM algorithm.

    Returns:
        A ``HawkesResult`` with background rate, triggering
            kernel parameters, branching ratio, and log-likelihood.

    Raises:
        ImportError: If numpy is not installed.
        ValueError: If ``kernel`` is not ``"exponential"``, if fewer
            than 3 events are provided, or if the events do not span a
            finite, positive duration.
    """
    if kernel != "exponential":
        msg = f"Only 'exponential' kernel is supported, got {kernel!r}"
        raise ValueError(msg)

    try:
        import numpy as np
    except ImportError as exc:
        msg = "numpy is required for fit_hawkes_process(). Install with: pip install nyc311[stats]"
        raise ImportError(msg) from exc

    times = np.sort(np.asarray(event_times, dtype=float))
    n = len(times)

    if n < 3:
        msg = "Need at least 3 events to fit a Hawkes process."
        raise ValueError(msg)

    t_max = times[-1] - times[0]
    if not (np.isfinite(t_max) and t_max > 0):
        # Identical (or non-finite) timestamps would make every rate below
        # divide by zero and silently yield inf/NaN parameters.
        msg = "Event times must be finite and span a positive duration."
        raise ValueError(msg)
    # Shift so the observation window is [0, t_max].
    times = times - times[0]

    # Starting values: half of the raw event rate as background.
    mu = n / (2.0 * t_max)
    alpha = 0.1
    beta_param = 1.0
    tolerance = 1e-8

    for _ in range(max_iter):
        # E-step, accumulated on the fly.  The first event has no
        # predecessors, so it is background with probability 1.
        n_background = 1.0
        weighted_dt_sum = 0.0
        for i in range(1, n):
            dt = times[i] - times[:i]
            trigger = alpha * beta_param * np.exp(-beta_param * dt)
            # mu stays strictly positive (n_background >= 1), so total > 0.
            total = mu + np.sum(trigger)
            n_background += mu / total
            # Parent probabilities are trigger / total; weight the lags by them.
            weighted_dt_sum += np.sum(trigger * dt) / total

        # M-step.
        mu_new = n_background / t_max
        n_triggered = n - n_background
        alpha_new = n_triggered / n
        if n_triggered > 0 and weighted_dt_sum > 0:
            beta_new = n_triggered / weighted_dt_sum
        else:
            beta_new = beta_param

        converged = (
            abs(mu_new - mu) < tolerance
            and abs(alpha_new - alpha) < tolerance
            and abs(beta_new - beta_param) < tolerance
        )
        mu, alpha, beta_param = mu_new, alpha_new, beta_new
        if converged:
            break

    # Log-likelihood: sum of log-intensities at the events minus the
    # integrated intensity (compensator) over [0, t_max].
    ll = 0.0
    for i in range(n):
        lam_i = mu
        if i > 0:
            dt = times[i] - times[:i]
            lam_i += alpha * beta_param * float(np.sum(np.exp(-beta_param * dt)))
        ll += np.log(max(lam_i, 1e-10))
    ll -= mu * t_max
    ll += alpha * float(np.sum(np.exp(-beta_param * (t_max - times)) - 1.0))

    # The kernel alpha * beta * exp(-beta * t) integrates to alpha, so the
    # branching ratio is alpha itself (not alpha / beta).
    branching = alpha

    summary = (
        f"Hawkes Process: {n} events over {t_max:.1f} time units\n"
        f"Background rate (mu): {mu:.4f}\n"
        f"Triggering: alpha={alpha:.4f}, beta={beta_param:.4f}\n"
        f"Branching ratio: {branching:.4f}\n"
        f"Log-likelihood: {ll:.2f}"
    )

    return HawkesResult(
        background_rate=float(mu),
        triggering_kernel_alpha=float(alpha),
        triggering_kernel_beta=float(beta_param),
        branching_ratio=float(branching),
        n_events=n,
        log_likelihood=float(ll),
        model_summary=summary,
    )

interrupted_time_series

interrupted_time_series(
    series: Any,
    intervention_date: date,
    *,
    covariates: Any | None = None,
) -> ITSResult

Fit a segmented interrupted-time-series regression.

Estimates pre-intervention level and trend, the immediate level change at intervention_date, and the post-intervention trend change, following the standard ITS regression specification.

Parameters:

Name Type Description Default
series Any

A pandas.Series indexed by a DatetimeIndex containing the outcome to model.

required
intervention_date date

The date the intervention began. Observations on or after this date are treated as post-intervention.

required
covariates Any | None

Optional pandas.DataFrame of exogenous regressors aligned to series. Each column is added to the design matrix.

None

Returns:

Name Type Description
ITSResult

An ITSResult with pre/post trends, the level and trend changes at intervention_date, p-values for the level and trend coefficients, and the full model summary string.

Raises:

Type Description
ImportError

If statsmodels or pandas is not installed. Install the optional stats extra with pip install nyc311[stats].

TypeError

If series does not use a DatetimeIndex.

Source code in src/nyc311/stats/_its.py
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def interrupted_time_series(
    series: Any,
    intervention_date: date,
    *,
    covariates: Any | None = None,
) -> ITSResult:
    """Fit a segmented interrupted-time-series regression.

    Estimates pre-intervention level and trend, the immediate level
    change at ``intervention_date``, and the post-intervention trend
    change, following the standard ITS regression specification::

        y = b0 + b1 * time + b2 * intervention + b3 * time_after

    where ``time_after`` counts periods since the first
    post-intervention observation, so ``b2`` is the jump at the
    intervention itself and ``b3`` is the change in slope.

    Args:
        series: A ``pandas.Series`` indexed by a ``DatetimeIndex``
            containing the outcome to model.
        intervention_date: The date the intervention began. Observations
            on or after this date are treated as post-intervention.
        covariates: Optional ``pandas.DataFrame`` of exogenous regressors
            aligned to ``series``. Each column is added to the design
            matrix by position (its index is not used for alignment).

    Returns:
        An ``ITSResult`` with pre/post trends, the level and trend
            changes at ``intervention_date``, p-values for the level and
            trend coefficients, and the full model summary string.

    Raises:
        ImportError: If statsmodels or pandas is not installed. Install
            the optional stats extra with ``pip install nyc311[stats]``.
        TypeError: If ``series`` does not use a ``DatetimeIndex``.
        ValueError: If ``series`` has no observations before, or none on
            or after, ``intervention_date``.
    """
    try:
        import numpy as np
        import pandas as pd
        from statsmodels.regression.linear_model import OLS
        from statsmodels.tools import add_constant
    except ImportError as exc:
        message = (
            "statsmodels and pandas are required for interrupted_time_series(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(message) from exc

    if not isinstance(series.index, pd.DatetimeIndex):
        msg = "series must have a DatetimeIndex."
        raise TypeError(msg)

    df = pd.DataFrame({"y": series})
    # Time is the positional period counter 0..n-1.
    # NOTE(review): this presumes the series is in chronological order.
    df["time"] = np.arange(len(df))
    df["intervention"] = (df.index >= pd.Timestamp(intervention_date)).astype(int)

    n_post = int(df["intervention"].sum())
    if n_post == 0 or n_post == len(df):
        # With a constant intervention dummy the level and trend changes are
        # not identified and the regression output would be meaningless.
        msg = (
            "series must contain observations both before and on/after "
            "intervention_date."
        )
        raise ValueError(msg)

    # Centre the post-intervention clock on the first post observation so the
    # "intervention" coefficient is the jump at the intervention, not the jump
    # extrapolated back to time 0.
    first_post_time = int(df.loc[df["intervention"] == 1, "time"].min())
    df["time_after"] = (df["time"] - first_post_time) * df["intervention"]

    exog_cols = ["time", "intervention", "time_after"]
    if covariates is not None:
        for col in covariates.columns:
            df[col] = covariates[col].to_numpy()
            exog_cols.append(col)

    exog = add_constant(df[exog_cols])
    model = OLS(df["y"], exog, missing="drop").fit()

    pre_trend = float(model.params["time"])
    trend_change = float(model.params["time_after"])
    post_trend = pre_trend + trend_change
    level_change = float(model.params["intervention"])
    p_level = float(model.pvalues["intervention"])
    p_trend = float(model.pvalues["time_after"])

    return ITSResult(
        pre_trend=pre_trend,
        post_trend=post_trend,
        level_change=level_change,
        trend_change=trend_change,
        p_value_level=p_level,
        p_value_trend=p_trend,
        model_summary=str(model.summary()),
    )

panel_fixed_effects

panel_fixed_effects(
    panel: PanelDataset,
    outcome: str,
    regressors: tuple[str, ...],
    *,
    time_effects: bool = False,
    cluster: Literal["entity", "time", "both"] = "entity",
) -> PanelRegressionResult

Estimate a fixed-effects panel regression.

Wraps :class:linearmodels.panel.PanelOLS with entity fixed effects by default and optional two-way fixed effects.

Parameters:

Name Type Description Default
panel PanelDataset

A :class:~nyc311.temporal.PanelDataset providing the data, entities, and periods.

required
outcome str

Name of the dependent variable column.

required
regressors tuple[str, ...]

Names of independent variable columns.

required
time_effects bool

When True, include time fixed effects in addition to entity fixed effects (two-way FE).

False
cluster Literal['entity', 'time', 'both']

Cluster standard errors by "entity" (default), "time", or "both".

'entity'

Returns:

Name Type Description
PanelRegressionResult

A PanelRegressionResult with coefficients, standard errors, p-values, R-squared, observation counts, and the full linearmodels summary string.

Raises:

Type Description
ImportError

If linearmodels or pandas is not installed. Install the optional stats extra with pip install nyc311[stats].

ValueError

If outcome or any of regressors is missing from the panel.

Source code in src/nyc311/stats/_panel_models.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
def panel_fixed_effects(
    panel: PanelDataset,
    outcome: str,
    regressors: tuple[str, ...],
    *,
    time_effects: bool = False,
    cluster: Literal["entity", "time", "both"] = "entity",
) -> PanelRegressionResult:
    """Fit a panel regression with entity fixed effects.

    Thin wrapper around ``linearmodels.panel.PanelOLS``.  Entity
    effects are always absorbed; time effects are absorbed as well when
    requested, giving a two-way fixed-effects model.  Standard errors
    are always clustered.

    Args:
        panel: The :class:`~nyc311.temporal.PanelDataset` supplying the
            data, entities, and periods.
        outcome: Column name of the dependent variable.
        regressors: Column names of the independent variables.
        time_effects: If ``True``, absorb time fixed effects in addition
            to entity fixed effects.
        cluster: Dimension(s) to cluster standard errors on:
            ``"entity"`` (default), ``"time"``, or ``"both"``.

    Returns:
        A ``PanelRegressionResult`` holding coefficients, standard
            errors, p-values, R-squared, observation counts, and the
            ``linearmodels`` summary rendered as a string.

    Raises:
        ImportError: If ``linearmodels`` or pandas is not installed.
            Install the optional stats extra with
            ``pip install nyc311[stats]``.
        ValueError: If ``outcome`` or any of ``regressors`` is missing
            from the panel.
    """
    try:
        from linearmodels.panel import PanelOLS
    except ImportError as exc:
        message = (
            "linearmodels is required for panel regressions. "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(message) from exc

    def _as_float_map(labelled: Any) -> dict[str, float]:
        # Convert a labelled result vector into a plain {name: float} dict.
        return {str(label): float(number) for label, number in labelled.items()}

    frame = _prepare_panel_data(panel, outcome, regressors)
    dependent = frame[outcome]
    exogenous = frame[list(regressors)]

    # Every supported cluster option uses the clustered covariance
    # estimator; the lookup also rejects unknown option names.
    covariance_for = dict.fromkeys(("entity", "time", "both"), "clustered")

    estimator = PanelOLS(
        dependent,
        exogenous,
        entity_effects=True,
        time_effects=time_effects,
    )
    fitted = estimator.fit(
        cov_type=covariance_for[cluster],
        cluster_entity=cluster in ("entity", "both"),
        cluster_time=cluster in ("time", "both"),
    )

    if time_effects:
        method_label = "two_way_fe"
    else:
        method_label = "entity_fe"

    return PanelRegressionResult(
        method=method_label,
        coefficients=_as_float_map(fitted.params),
        std_errors=_as_float_map(fitted.std_errors),
        p_values=_as_float_map(fitted.pvalues),
        r_squared=float(fitted.rsquared),
        n_observations=int(fitted.nobs),
        n_entities=int(fitted.entity_info.total),
        n_periods=int(fitted.time_info.total),
        model_summary=str(fitted.summary),
    )

panel_random_effects

panel_random_effects(
    panel: PanelDataset,
    outcome: str,
    regressors: tuple[str, ...],
) -> PanelRegressionResult

Estimate a random-effects panel regression.

Wraps :class:linearmodels.panel.RandomEffects.

Parameters:

Name Type Description Default
panel PanelDataset

A :class:~nyc311.temporal.PanelDataset providing the data, entities, and periods.

required
outcome str

Name of the dependent variable column.

required
regressors tuple[str, ...]

Names of independent variable columns.

required

Returns:

Name Type Description
PanelRegressionResult

A PanelRegressionResult with coefficients, standard errors, p-values, R-squared, observation counts, and the full linearmodels summary string.

Raises:

Type Description
ImportError

If linearmodels or pandas is not installed. Install the optional stats extra with pip install nyc311[stats].

ValueError

If outcome or any of regressors is missing from the panel.

Source code in src/nyc311/stats/_panel_models.py
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
def panel_random_effects(
    panel: PanelDataset,
    outcome: str,
    regressors: tuple[str, ...],
) -> PanelRegressionResult:
    """Fit a random-effects panel regression.

    Thin wrapper around ``linearmodels.panel.RandomEffects`` fitted
    with its default settings.

    Args:
        panel: The :class:`~nyc311.temporal.PanelDataset` supplying the
            data, entities, and periods.
        outcome: Column name of the dependent variable.
        regressors: Column names of the independent variables.

    Returns:
        A ``PanelRegressionResult`` holding coefficients, standard
            errors, p-values, R-squared, observation counts, and the
            ``linearmodels`` summary rendered as a string.

    Raises:
        ImportError: If ``linearmodels`` or pandas is not installed.
            Install the optional stats extra with
            ``pip install nyc311[stats]``.
        ValueError: If ``outcome`` or any of ``regressors`` is missing
            from the panel.
    """
    try:
        from linearmodels.panel import RandomEffects
    except ImportError as exc:
        message = (
            "linearmodels is required for panel regressions. "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(message) from exc

    def _as_float_map(labelled: Any) -> dict[str, float]:
        # Convert a labelled result vector into a plain {name: float} dict.
        return {str(label): float(number) for label, number in labelled.items()}

    frame = _prepare_panel_data(panel, outcome, regressors)
    fitted = RandomEffects(frame[outcome], frame[list(regressors)]).fit()

    return PanelRegressionResult(
        method="random_effects",
        coefficients=_as_float_map(fitted.params),
        std_errors=_as_float_map(fitted.std_errors),
        p_values=_as_float_map(fitted.pvalues),
        r_squared=float(fitted.rsquared),
        n_observations=int(fitted.nobs),
        n_entities=int(fitted.entity_info.total),
        n_periods=int(fitted.time_info.total),
        model_summary=str(fitted.summary),
    )

minimum_detectable_effect

minimum_detectable_effect(
    n_units: int,
    n_periods: int,
    *,
    icc: float = 0.05,
    alpha: float = 0.05,
    power: float = 0.8,
    proportion_treated: float = 0.5,
    outcome_variance: float = 1.0,
    r_squared: float = 0.0,
) -> PowerResult

Compute the minimum detectable effect for a panel experiment.

Uses the standard cluster-RCT MDE formula:

MDE = (z_{alpha/2} + z_{beta}) * sqrt(2 * sigma^2 * DE / (N_t * T))

where DE = 1 + (T - 1) * ICC is the design effect.

Parameters:

Name Type Description Default
n_units int

Total number of geographic units (clusters).

required
n_periods int

Number of time periods observed.

required
icc float

Intra-cluster correlation coefficient. Defaults to 0.05.

0.05
alpha float

Significance level. Defaults to 0.05.

0.05
power float

Statistical power (1 - beta). Defaults to 0.80.

0.8
proportion_treated float

Fraction of units assigned to treatment. Defaults to 0.5.

0.5
outcome_variance float

Variance of the outcome variable. Defaults to 1.0.

1.0
r_squared float

Proportion of variance explained by covariates. Defaults to 0.0 (no covariates).

0.0

Returns:

Name Type Description
PowerResult

A PowerResult with the computed MDE and all design parameters.

Raises:

Type Description
ImportError

If scipy is not installed. Install with pip install nyc311[stats].

ValueError

If any parameter is out of its valid range.

Source code in src/nyc311/stats/_power.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
def minimum_detectable_effect(
    n_units: int,
    n_periods: int,
    *,
    icc: float = 0.05,
    alpha: float = 0.05,
    power: float = 0.80,
    proportion_treated: float = 0.5,
    outcome_variance: float = 1.0,
    r_squared: float = 0.0,
) -> PowerResult:
    """Compute the minimum detectable effect for a panel experiment.

    Uses the standard cluster-RCT MDE formula:

        MDE = (z_{alpha/2} + z_{beta}) * sqrt(2 * sigma^2 * DE / (N_t * T))

    where DE = 1 + (T - 1) * ICC is the design effect, N_t is the number
    of treated units, and sigma^2 is the outcome variance left over
    after covariate adjustment.

    Args:
        n_units: Total number of geographic units (clusters).  Must be
            at least 2.
        n_periods: Number of time periods observed.  Must be at least 1.
        icc: Intra-cluster correlation coefficient in ``[-1, 1]``.
            Defaults to ``0.05``.
        alpha: Two-sided significance level in ``(0, 1)``.  Defaults to
            ``0.05``.
        power: Statistical power (1 - beta) in ``(0, 1)``.  Defaults to
            ``0.80``.
        proportion_treated: Fraction of units assigned to treatment,
            in ``(0, 1)``.  Defaults to ``0.5``.
        outcome_variance: Variance of the outcome variable; must be
            non-negative.  Defaults to ``1.0``.
        r_squared: Proportion of variance explained by covariates, in
            ``[0, 1]``.  Defaults to ``0.0`` (no covariates).

    Returns:
        A ``PowerResult`` with the computed MDE and the design
            parameters it was computed from.

    Raises:
        ImportError: If scipy is not installed.  Install with
            ``pip install nyc311[stats]``.
        ValueError: If any parameter is out of its valid range.
    """
    try:
        from scipy.stats import norm
    except ImportError as exc:
        msg = "scipy is required for minimum_detectable_effect(). Install with: pip install nyc311[stats]"
        raise ImportError(msg) from exc

    if n_units < 2:
        msg = "n_units must be at least 2."
        raise ValueError(msg)
    if n_periods < 1:
        msg = "n_periods must be at least 1."
        raise ValueError(msg)
    if not 0.0 < proportion_treated < 1.0:
        msg = "proportion_treated must be in (0, 1)."
        raise ValueError(msg)
    # The chained comparisons below also reject NaN.
    if not 0.0 < alpha < 1.0:
        msg = "alpha must be in (0, 1)."
        raise ValueError(msg)
    if not 0.0 < power < 1.0:
        msg = "power must be in (0, 1)."
        raise ValueError(msg)
    if not -1.0 <= icc <= 1.0:
        msg = "icc must be in [-1, 1]."
        raise ValueError(msg)
    if not outcome_variance >= 0.0:
        msg = "outcome_variance must be non-negative."
        raise ValueError(msg)
    if not 0.0 <= r_squared <= 1.0:
        msg = "r_squared must be in [0, 1]."
        raise ValueError(msg)

    design_effect = 1.0 + (n_periods - 1) * icc
    if design_effect < 0.0:
        # A negative design effect would put a negative number under the
        # square root (yielding a complex "effect size").
        msg = "icc is too negative for n_periods: the design effect is negative."
        raise ValueError(msg)

    z_alpha = float(norm.ppf(1.0 - alpha / 2.0))
    z_beta = float(norm.ppf(power))

    n_treated = n_units * proportion_treated
    adjusted_var = outcome_variance * (1.0 - r_squared)

    # NOTE(review): the factor 2 / N_t matches the usual 1 / (p * (1 - p) * N)
    # variance term only for a balanced design (proportion_treated = 0.5).
    mde = (z_alpha + z_beta) * (
        (2.0 * adjusted_var * design_effect / (n_treated * n_periods)) ** 0.5
    )

    return PowerResult(
        mde=float(mde),
        alpha=alpha,
        power=power,
        n_units=n_units,
        n_periods=n_periods,
        icc=icc,
        variance_explained=r_squared,
    )

regression_discontinuity

regression_discontinuity(
    running_variable: Any,
    outcome: Any,
    cutoff: float = 0.0,
    *,
    kernel: str = "triangular",
    bandwidth: float | None = None,
    polynomial_order: int = 1,
) -> RDResult

Estimate a local treatment effect at a sharp cutoff.

Fits local polynomials on each side of the cutoff, using the Imbens-Kalyanaraman (IK) or Calonico-Cattaneo-Titiunik (CCT) bandwidth selector when bandwidth is None.

Parameters:

Name Type Description Default
running_variable Any

Array-like running (assignment) variable.

required
outcome Any

Array-like outcome variable of the same length.

required
cutoff float

The threshold value of the running variable. Defaults to 0.0.

0.0
kernel str

Kernel for local weighting. One of "triangular" (default), "epanechnikov", or "uniform".

'triangular'
bandwidth float | None

Bandwidth for the local polynomial fit. When None, an optimal bandwidth is selected automatically.

None
polynomial_order int

Degree of the local polynomial. Defaults to 1 (local linear).

1

Returns:

Name Type Description
RDResult

An RDResult with the treatment effect estimate, robust standard error, bias-corrected confidence interval, effective sample sizes, and bandwidth.

Raises:

Type Description
ImportError

If numpy or scipy is not installed.

ValueError

If arrays are mismatched or too few observations exist on either side.

Source code in src/nyc311/stats/_rdd.py
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
def regression_discontinuity(
    running_variable: Any,
    outcome: Any,
    cutoff: float = 0.0,
    *,
    kernel: str = "triangular",
    bandwidth: float | None = None,
    polynomial_order: int = 1,
) -> RDResult:
    """Estimate a local treatment effect at a sharp cutoff.

    Observations are split at ``cutoff`` (values equal to the cutoff
    are assigned to the right side). A kernel-weighted polynomial of
    degree ``polynomial_order`` is fitted separately on each side
    within the bandwidth, and the effect is the difference between the
    two fitted intercepts at the cutoff. When ``bandwidth`` is
    ``None`` it is chosen by ``_ik_bandwidth``; the same bandwidth is
    applied to both sides.

    Args:
        running_variable: Array-like running (assignment) variable.
        outcome: Array-like outcome variable of the same length.
        cutoff: The threshold value of the running variable.
            Defaults to ``0.0``.
        kernel: Kernel for local weighting. One of ``"triangular"``
            (default), ``"epanechnikov"``, or ``"uniform"``.
        bandwidth: Bandwidth for the local polynomial fit. When
            ``None``, a bandwidth is selected automatically.
        polynomial_order: Degree of the local polynomial.
            Defaults to ``1`` (local linear).

    Returns:
        An :class:`RDResult` with the treatment effect estimate, its
        heteroskedasticity-robust (sandwich) standard error, a
        two-sided normal-approximation p-value, the
        ``estimate +/- 1.96 * SE`` confidence interval, the per-side
        bandwidths and effective sample sizes, the kernel name, and a
        text summary.

    Raises:
        ImportError: If numpy or scipy is not installed.
        ValueError: If ``kernel`` is not one of the supported names,
            ``polynomial_order`` is negative, the arrays differ in
            length, fewer than 3 observations lie on either side of
            the cutoff, or too few observations fall within the
            bandwidth to fit the requested polynomial.
    """
    try:
        import numpy as np
        from scipy.stats import norm
    except ImportError as exc:
        msg = (
            "numpy and scipy are required for regression_discontinuity(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(msg) from exc

    # Reject typos up front: an unrecognised name used to fall through to
    # uniform weights while still being reported as the requested kernel.
    if kernel not in ("triangular", "epanechnikov", "uniform"):
        msg = (
            f"Unknown kernel {kernel!r}; expected 'triangular', "
            "'epanechnikov', or 'uniform'."
        )
        raise ValueError(msg)

    if polynomial_order < 0:
        msg = "polynomial_order must be non-negative."
        raise ValueError(msg)

    x = np.asarray(running_variable, dtype=float)
    y = np.asarray(outcome, dtype=float)

    if len(x) != len(y):
        msg = "running_variable and outcome must have the same length."
        raise ValueError(msg)

    x_centered = x - cutoff
    left_mask = x_centered < 0
    right_mask = x_centered >= 0

    if left_mask.sum() < 3 or right_mask.sum() < 3:
        msg = "Need at least 3 observations on each side of the cutoff."
        raise ValueError(msg)

    if bandwidth is None:
        bandwidth = _ik_bandwidth(x_centered, y)

    # A single symmetric bandwidth is used; it is reported per side.
    bw_left = bandwidth
    bw_right = bandwidth

    left_bw_mask = left_mask & (x_centered >= -bw_left)
    right_bw_mask = right_mask & (x_centered <= bw_right)

    n_left = int(left_bw_mask.sum())
    n_right = int(right_bw_mask.sum())

    # A degree-p polynomial has p + 1 coefficients, so fewer points than
    # that on a side would make the normal equations singular.
    min_obs = max(2, polynomial_order + 1)
    if n_left < min_obs or n_right < min_obs:
        msg = "Too few observations within bandwidth."
        raise ValueError(msg)

    def _kernel_weights(u: Any) -> Any:
        """Return kernel weights for bandwidth-scaled distances ``u``."""
        u_abs = np.abs(u)
        if kernel == "triangular":
            return np.maximum(1.0 - u_abs, 0.0)
        if kernel == "epanechnikov":
            return np.maximum(0.75 * (1.0 - u_abs**2), 0.0)
        return np.ones_like(u_abs)

    x_left = x_centered[left_bw_mask]
    y_left = y[left_bw_mask]
    w_left = _kernel_weights(x_left / bw_left)

    x_right = x_centered[right_bw_mask]
    y_right = y[right_bw_mask]
    w_right = _kernel_weights(x_right / bw_right)

    def _wls_fit(xv: Any, yv: Any, wv: Any, order: int) -> tuple[Any, Any]:
        """Weighted least squares fit with a sandwich covariance matrix."""
        design = np.column_stack([xv**p for p in range(order + 1)])
        # Broadcasting the weights over the columns of design.T equals
        # design.T @ diag(wv) without allocating an n x n matrix.
        xtw = design.T * wv
        xtwx = xtw @ design
        beta = np.linalg.solve(xtwx, xtw @ yv)
        resid = yv - design @ beta
        bread = np.linalg.inv(xtwx)
        meat = (design.T * (wv * resid) ** 2) @ design
        vcov = bread @ meat @ bread
        return beta, vcov

    beta_left, vcov_left = _wls_fit(x_left, y_left, w_left, polynomial_order)
    beta_right, vcov_right = _wls_fit(x_right, y_right, w_right, polynomial_order)

    # x is centred on the cutoff, so each intercept is the fitted value at
    # the cutoff and their difference is the estimated jump.
    tau = float(beta_right[0] - beta_left[0])
    se = float(np.sqrt(vcov_left[0, 0] + vcov_right[0, 0]))
    se = max(se, 1e-10)  # floor keeps the z-statistic finite

    z = tau / se
    # The survival function avoids the cancellation in 1 - cdf for large |z|.
    p_value = float(2.0 * norm.sf(abs(z)))
    ci_lo = tau - 1.96 * se
    ci_hi = tau + 1.96 * se

    summary = (
        f"RD Estimate: {tau:.4f} (SE={se:.4f}, p={p_value:.4f})\n"
        f"Bandwidth: [{bw_left:.4f}, {bw_right:.4f}]\n"
        f"Effective N: {n_left} (left), {n_right} (right)\n"
        f"Kernel: {kernel}, Polynomial order: {polynomial_order}"
    )

    return RDResult(
        treatment_effect=tau,
        se_robust=se,
        p_value=p_value,
        ci_lower=ci_lo,
        ci_upper=ci_hi,
        bandwidth_left=bw_left,
        bandwidth_right=bw_right,
        n_effective_left=n_left,
        n_effective_right=n_right,
        kernel=kernel,
        model_summary=summary,
    )

latent_reporting_bias_em

latent_reporting_bias_em(
    complaint_counts: dict[str, int],
    populations: dict[str, int],
    covariates: dict[str, dict[str, float]] | None = None,
    *,
    max_iter: int = 200,
    tol: float = 1e-06,
) -> LatentReportingResult

Estimate true complaint rates via expectation-maximization.

Models observed counts as a product of a latent true rate and a reporting probability. The EM algorithm iterates between estimating true rates (M-step, Poisson MLE) and reporting probabilities (M-step, logistic on covariates).

Parameters:

Name Type Description Default
complaint_counts dict[str, int]

Mapping {unit_id: observed_count}.

required
populations dict[str, int]

Mapping {unit_id: population}.

required
covariates dict[str, dict[str, float]] | None

Optional mapping {unit_id: {covariate_name: value}}. When None, a uniform reporting probability is assumed.

None
max_iter int

Maximum EM iterations.

200
tol float

Convergence tolerance on log-likelihood change.

1e-06

Returns:

Type Description
LatentReportingResult

A LatentReportingResult with estimated true rates, reporting probabilities, and convergence diagnostics.

Raises:

Type Description
ImportError

If numpy or scipy is not installed.

Source code in src/nyc311/stats/_reporting_bias.py
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
def latent_reporting_bias_em(
    complaint_counts: dict[str, int],
    populations: dict[str, int],
    covariates: dict[str, dict[str, float]] | None = None,
    *,
    max_iter: int = 200,
    tol: float = 1e-6,
) -> LatentReportingResult:
    """Estimate true complaint rates via expectation-maximization.

    Models observed counts as the product of a latent true rate, the
    unit population, and a reporting probability. Each iteration
    imputes expected true counts from the current reporting
    probabilities and converts them to per-capita rates, then updates
    the reporting probabilities: with up to five Newton steps of a
    logistic model when covariates are supplied, otherwise with a
    ratio clipped to ``[0.01, 0.99]``. The Poisson log-likelihood (up
    to an additive constant) is recorded after every iteration.

    Args:
        complaint_counts: Mapping ``{unit_id: observed_count}``. Its
            keys define the set of units.
        populations: Mapping ``{unit_id: population}``. Must contain
            every unit in ``complaint_counts``; populations below 1
            are treated as 1 when forming rates.
        covariates: Optional mapping
            ``{unit_id: {covariate_name: value}}``. Covariate names
            are taken from the first unit's mapping and must be
            present for every unit. When ``None`` or empty, a uniform
            reporting probability is assumed.
        max_iter: Maximum EM iterations.
        tol: Convergence tolerance on log-likelihood change.

    Returns:
        A :class:`LatentReportingResult` with estimated true rates,
        reporting probabilities, observed rates, the number of
        iterations run, a convergence flag, and the log-likelihood
        trace.

    Raises:
        ImportError: If numpy or scipy is not installed.
        KeyError: If a unit in ``complaint_counts`` is missing from
            ``populations`` or ``covariates``.
    """
    try:
        import numpy as np
        from scipy.special import expit
    except ImportError as exc:
        msg = (
            "numpy and scipy are required for latent_reporting_bias_em(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(msg) from exc

    # Sorting fixes a deterministic unit order for every array below.
    unit_ids = sorted(complaint_counts)
    n = len(unit_ids)

    y = np.array([complaint_counts[uid] for uid in unit_ids], dtype=float)
    pop = np.array([populations[uid] for uid in unit_ids], dtype=float)

    observed_rates = y / np.maximum(pop, 1.0)

    lambda_hat = observed_rates.copy() + 1e-8
    rho_hat = np.full(n, 0.5)

    # An empty mapping carries no covariate information, so treat it like
    # None rather than failing on next(iter(...)) with StopIteration.
    if covariates:
        cov_names = sorted(next(iter(covariates.values())).keys())
        x = np.column_stack(
            [
                np.array([covariates[uid][c] for uid in unit_ids], dtype=float)
                for c in cov_names
            ]
        )
        x = np.column_stack([np.ones(n), x])  # prepend the intercept column
        beta = np.zeros(x.shape[1])
    else:
        x = None
        beta = None

    ll_trace: list[float] = []
    converged = False

    for _iteration in range(max_iter):
        # Impute true counts from the current reporting probabilities and
        # convert them to per-capita rates.
        expected_true = y / np.maximum(rho_hat, 1e-10)
        lambda_hat = expected_true / np.maximum(pop, 1.0)
        lambda_hat = np.maximum(lambda_hat, 1e-10)

        if x is not None and beta is not None:
            # A few Newton steps on the logistic reporting model.
            for _ in range(5):
                rho_pred = expit(x @ beta)
                residual = (y / np.maximum(lambda_hat * pop, 1e-10)) - rho_pred
                grad = x.T @ residual
                # Broadcasting the logistic variance over the columns of
                # x.T equals x.T @ diag(v) without an n x n matrix.
                hess = -(x.T * (rho_pred * (1 - rho_pred))) @ x
                try:
                    step = np.linalg.solve(hess, grad)
                    beta = beta - step
                except np.linalg.LinAlgError:
                    # Singular Hessian: keep the current coefficients.
                    break
            rho_hat = expit(x @ beta)
        else:
            rho_hat = np.clip(y / np.maximum(lambda_hat * pop, 1e-10), 0.01, 0.99)

        ll = float(
            np.sum(
                y * np.log(np.maximum(lambda_hat * pop * rho_hat, 1e-10))
                - lambda_hat * pop * rho_hat
            )
        )
        ll_trace.append(ll)

        if len(ll_trace) > 1 and abs(ll_trace[-1] - ll_trace[-2]) < tol:
            converged = True
            break

    return LatentReportingResult(
        estimated_true_rates={
            uid: float(lambda_hat[i]) for i, uid in enumerate(unit_ids)
        },
        reporting_probabilities={
            uid: float(rho_hat[i]) for i, uid in enumerate(unit_ids)
        },
        observed_rates={
            uid: float(observed_rates[i]) for i, uid in enumerate(unit_ids)
        },
        n_iterations=len(ll_trace),
        converged=converged,
        log_likelihood_trace=tuple(ll_trace),
    )

reporting_rate_adjustment

reporting_rate_adjustment(
    panel: PanelDataset,
    outcome: str,
    demographic_covariates: tuple[str, ...],
) -> ReportingAdjustmentResult

Adjust complaint rates for neighborhood reporting propensity.

Fits a mixed-effects model with unit random intercepts:

outcome ~ covariates + (1 | unit_id)

The random intercepts capture unit-level reporting propensity after controlling for demographic covariates.

Parameters:

Name Type Description Default
panel PanelDataset

A PanelDataset with covariates attached.

required
outcome str

Column name for the complaint rate to adjust.

required
demographic_covariates tuple[str, ...]

Column names for demographic controls (e.g. median income, population density).

required

Returns:

Type Description
ReportingAdjustmentResult

A ReportingAdjustmentResult with raw and adjusted rates, random intercepts, ICC, and model summary.

Raises:

Type Description
ImportError

If statsmodels or pandas is not installed.

Source code in src/nyc311/stats/_reporting_bias.py
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
def reporting_rate_adjustment(
    panel: PanelDataset,
    outcome: str,
    demographic_covariates: tuple[str, ...],
) -> ReportingAdjustmentResult:
    """Adjust complaint rates for neighborhood reporting propensity.

    A mixed-effects model with a random intercept per unit is fitted
    by REML:

        outcome ~ covariates + (1 | unit_id)

    Each unit's raw rate is the mean of ``outcome`` over its rows; the
    adjusted rate subtracts that unit's estimated random intercept.

    Args:
        panel: A :class:`PanelDataset` with covariates attached.
        outcome: Column name for the complaint rate to adjust.
        demographic_covariates: Column names for demographic controls
            (e.g. median income, population density).

    Returns:
        A :class:`ReportingAdjustmentResult` holding the raw rates,
        adjusted rates, per-unit adjustment factors (random
        intercepts), the covariates used, the intraclass correlation,
        and the fitted model's text summary.

    Raises:
        ImportError: If statsmodels or pandas is not installed.
    """
    try:
        from statsmodels.regression.mixed_linear_model import MixedLM
    except ImportError as exc:
        msg = (
            "statsmodels and pandas are required for "
            "reporting_rate_adjustment(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(msg) from exc

    # Flatten the index so "unit_id" is addressable as a regular column.
    frame = panel.to_dataframe().reset_index()

    rhs = " + ".join(demographic_covariates)
    formula = f"{outcome} ~ {rhs}"

    mixed_model = MixedLM.from_formula(formula, groups="unit_id", data=frame)
    fitted = mixed_model.fit(reml=True)  # pylint: disable=unexpected-keyword-arg

    unit_ids = sorted(frame["unit_id"].unique())
    raw_rates: dict[str, float] = {
        uid: float(frame.loc[frame["unit_id"] == uid, outcome].mean())
        for uid in unit_ids
    }

    # Units without an estimated random effect get a zero adjustment.
    intercepts = fitted.random_effects
    adjustment_factors: dict[str, float] = {
        uid: float(intercepts[uid].iloc[0]) if uid in intercepts else 0.0
        for uid in unit_ids
    }

    # cov_re is read via .iloc when it is table-like, else as a scalar.
    if hasattr(fitted.cov_re, "iloc"):
        group_var = float(fitted.cov_re.iloc[0, 0])
    else:
        group_var = float(fitted.cov_re)
    resid_var = float(fitted.scale)

    # ICC: share of total variance attributable to the unit intercepts.
    if (group_var + resid_var) > 0:
        icc = group_var / (group_var + resid_var)
    else:
        icc = 0.0

    adjusted_rates: dict[str, float] = {
        uid: raw_rates[uid] - adjustment_factors[uid] for uid in unit_ids
    }

    return ReportingAdjustmentResult(
        raw_rates=raw_rates,
        adjusted_rates=adjusted_rates,
        adjustment_factors=adjustment_factors,
        covariates_used=demographic_covariates,
        icc=icc,
        model_summary=str(fitted.summary()),
    )

global_morans_i

global_morans_i(
    values: dict[str, float],
    weights: dict[str, dict[str, float]],
) -> MoranResult

Compute Global Moran's I for values under spatial weights.

Parameters:

Name Type Description Default
values dict[str, float]

Mapping {unit_id: numeric_value} to test for spatial autocorrelation. Unit IDs must align with those in weights.

required
weights dict[str, dict[str, float]]

Nested dict {unit_a: {unit_b: weight}} describing the spatial weights matrix; typically row-standardized.

required

Returns:

Type Description
MoranResult

A MoranResult with the Moran's I statistic, the permutation-based p-value, the standardized z-score, and the expected value under the null hypothesis.

Raises:

Type Description
ImportError

If esda or libpysal is not installed. Install the optional stats extra with pip install nyc311[stats].

Source code in src/nyc311/stats/_spatial.py
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def global_morans_i(
    values: dict[str, float],
    weights: dict[str, dict[str, float]],
) -> MoranResult:
    """Compute Global Moran's I for ``values`` under spatial ``weights``.

    Units are processed in sorted ``unit_id`` order; a unit with no
    entry in ``weights`` is given an empty neighbor list.

    Args:
        values: Mapping ``{unit_id: numeric_value}`` to test for spatial
            autocorrelation. Unit IDs must align with those in
            ``weights``.
        weights: Nested dict ``{unit_a: {unit_b: weight}}`` describing
            the spatial weights matrix; typically row-standardized.

    Returns:
        A :class:`MoranResult` with the Moran's I statistic, the
        permutation-based p-value, the standardized z-score, and the
        expected value under the null hypothesis.

    Raises:
        ImportError: If ``esda`` or ``libpysal`` is not installed.
            Install the optional stats extra with
            ``pip install nyc311[stats]``.
    """
    try:
        import numpy as np
        from esda.moran import Moran
        from libpysal.weights import W
    except ImportError as exc:
        message = (
            "esda and libpysal are required for spatial autocorrelation. "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(message) from exc

    ordered_ids = sorted(values)
    observed = np.array([values[uid] for uid in ordered_ids])

    # Build the parallel neighbor-id and neighbor-weight lists per unit.
    neighbors: dict[str, list[str]] = {}
    weight_vals: dict[str, list[float]] = {}
    for uid in ordered_ids:
        links = weights.get(uid, {})
        neighbors[uid] = list(links)
        weight_vals[uid] = list(links.values())

    moran = Moran(observed, W(neighbors, weight_vals))
    return MoranResult(
        statistic=float(moran.I),
        p_value=float(moran.p_sim),
        z_score=float(moran.z_sim),
        expected=float(moran.EI),
    )

local_morans_i

local_morans_i(
    values: dict[str, float],
    weights: dict[str, dict[str, float]],
    *,
    permutations: int = 999,
) -> LISAResult

Compute Local Moran's I (LISA) for hotspot/coldspot identification.

Parameters:

Name Type Description Default
values dict[str, float]

Mapping {unit_id: numeric_value} for the variable being tested.

required
weights dict[str, dict[str, float]]

Nested dict {unit_a: {unit_b: weight}} describing the spatial weights matrix.

required
permutations int

Number of conditional permutations used to derive pseudo p-values.

999

Returns:

Type Description
LISAResult

A LISAResult containing the local statistic, pseudo p-values, and quadrant cluster labels ("HH", "LH", "LL", "HL", or "ns" for non-significant) per unit.

Raises:

Type Description
ImportError

If esda or libpysal is not installed. Install the optional stats extra with pip install nyc311[stats].

Source code in src/nyc311/stats/_spatial.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
def local_morans_i(
    values: dict[str, float],
    weights: dict[str, dict[str, float]],
    *,
    permutations: int = 999,
) -> LISAResult:
    """Compute Local Moran's I (LISA) for hotspot/coldspot identification.

    Units are processed in sorted ``unit_id`` order. A unit keeps its
    quadrant label only when its pseudo p-value is below 0.05;
    otherwise it is labelled ``"ns"``.

    Args:
        values: Mapping ``{unit_id: numeric_value}`` for the variable
            being tested.
        weights: Nested dict ``{unit_a: {unit_b: weight}}`` describing
            the spatial weights matrix.
        permutations: Number of conditional permutations used to derive
            pseudo p-values.

    Returns:
        A :class:`LISAResult` containing the local statistic, pseudo
        p-values, and quadrant cluster labels (``"HH"``, ``"LH"``,
        ``"LL"``, ``"HL"``, or ``"ns"`` for non-significant) per unit.

    Raises:
        ImportError: If ``esda`` or ``libpysal`` is not installed.
            Install the optional stats extra with
            ``pip install nyc311[stats]``.
    """
    try:
        import numpy as np
        from esda.moran import Moran_Local
        from libpysal.weights import W
    except ImportError as exc:
        message = (
            "esda and libpysal are required for LISA analysis. "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(message) from exc

    ordered_ids = sorted(values)
    observed = np.array([values[uid] for uid in ordered_ids])

    # Build the parallel neighbor-id and neighbor-weight lists per unit.
    neighbors: dict[str, list[str]] = {}
    weight_vals: dict[str, list[float]] = {}
    for uid in ordered_ids:
        links = weights.get(uid, {})
        neighbors[uid] = list(links)
        weight_vals[uid] = list(links.values())

    lisa = Moran_Local(observed, W(neighbors, weight_vals), permutations=permutations)

    # Quadrant codes without a known label also fall back to "ns".
    cluster_labels = tuple(
        _LISA_QUAD_LABELS.get(int(quad), "ns") if lisa.p_sim[idx] < 0.05 else "ns"
        for idx, quad in enumerate(lisa.q)
    )

    return LISAResult(
        local_statistic=tuple(float(stat) for stat in lisa.Is),
        p_values=tuple(float(p) for p in lisa.p_sim),
        cluster_labels=cluster_labels,
        unit_ids=tuple(ordered_ids),
    )

spatial_error_model

spatial_error_model(
    panel: PanelDataset,
    weights: dict[str, dict[str, float]],
    outcome: str,
    regressors: tuple[str, ...],
    *,
    period: str | None = None,
) -> SpatialErrorResult

Fit a spatial error (SEM) model via maximum likelihood.

Estimates: y = X @ beta + u, u = lambda * W @ u + epsilon

Parameters:

Name Type Description Default
panel PanelDataset

A PanelDataset containing the outcome and regressor columns.

required
weights dict[str, dict[str, float]]

Nested dict {unit_a: {unit_b: weight}} of spatial weights (row-standardized).

required
outcome str

Column name for the dependent variable.

required
regressors tuple[str, ...]

Column names for the independent variables.

required
period str | None

If given, extract only this period as a cross-section. If None, collapse across periods via group means.

None

Returns:

Type Description
SpatialErrorResult

A SpatialErrorResult with estimated coefficients, the spatial error parameter (lambda), and fit statistics.

Raises:

Type Description
ImportError

If spreg or libpysal is not installed.

Source code in src/nyc311/stats/_spatial_regression.py
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
def spatial_error_model(
    panel: PanelDataset,
    weights: dict[str, dict[str, float]],
    outcome: str,
    regressors: tuple[str, ...],
    *,
    period: str | None = None,
) -> SpatialErrorResult:
    """Fit a spatial error (SEM) model via maximum likelihood.

    Estimates: y = X @ beta + u,  u = lambda * W @ u + epsilon

    Args:
        panel: A :class:`PanelDataset` containing the outcome and
            regressor columns.
        weights: Nested dict ``{unit_a: {unit_b: weight}}`` of spatial
            weights (row-standardized). Units are looked up by the
            string form of their cross-section index label; a unit
            with no entry gets an empty neighbor list.
        outcome: Column name for the dependent variable.
        regressors: Column names for the independent variables.
        period: If given, extract only this period as a cross-section.
            If ``None``, collapse across periods via group means.

    Returns:
        A :class:`SpatialErrorResult` with estimated coefficients, the
        spatial error parameter (lambda), and fit statistics.
        Coefficient dictionaries are keyed by ``"CONSTANT"`` followed
        by the regressor names.

    Raises:
        ImportError: If spreg or libpysal is not installed.
    """
    try:
        import numpy as np
        from libpysal.weights import W
        from spreg import ML_Error
    except ImportError as exc:
        msg = (
            "spreg and libpysal are required for spatial_error_model(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(msg) from exc

    df = _extract_cross_section(panel, outcome, regressors, period)

    # Key the weights object by the same string form used to look units up
    # in ``weights``. Keying by the raw index label left string neighbor
    # ids that matched no key whenever the labels were not already strings.
    neighbors: dict[str, list[str]] = {}
    weight_vals: dict[str, list[float]] = {}
    for uid in df.index:
        key = str(uid)
        links = weights.get(key, {})
        neighbors[key] = list(links)
        weight_vals[key] = list(links.values())
    w = W(neighbors, weight_vals)

    y = np.asarray(df[outcome].values, dtype=float).reshape(-1, 1)
    x = np.column_stack([np.asarray(df[r].values, dtype=float) for r in regressors])

    model = ML_Error(y, x, w, name_y=outcome, name_x=list(regressors))

    # The first len(var_names) estimates are read as the constant and the
    # regressors; the next entry is read as the spatial error parameter.
    var_names = ["CONSTANT", *regressors]
    n_betas = len(var_names)
    coefficients = {name: float(model.betas[i][0]) for i, name in enumerate(var_names)}
    std_errors = {name: float(model.std_err[i]) for i, name in enumerate(var_names)}  # pylint: disable=no-member
    p_values = {name: float(model.z_stat[i][1]) for i, name in enumerate(var_names)}  # pylint: disable=no-member

    lam = float(model.betas[n_betas][0])
    lam_p = float(model.z_stat[n_betas][1])  # pylint: disable=no-member

    return SpatialErrorResult(
        coefficients=coefficients,
        std_errors=std_errors,
        p_values=p_values,
        lam=lam,
        lam_p_value=lam_p,
        log_likelihood=float(model.logll),
        aic=float(model.aic),
        n_observations=int(model.n),
        model_summary=str(model.summary),  # pylint: disable=no-member
    )

spatial_lag_model

spatial_lag_model(
    panel: PanelDataset,
    weights: dict[str, dict[str, float]],
    outcome: str,
    regressors: tuple[str, ...],
    *,
    period: str | None = None,
) -> SpatialLagResult

Fit a spatial lag (SAR) model via maximum likelihood.

Estimates: y = rho * W @ y + X @ beta + epsilon

Parameters:

Name Type Description Default
panel PanelDataset

A PanelDataset containing the outcome and regressor columns.

required
weights dict[str, dict[str, float]]

Nested dict {unit_a: {unit_b: weight}} of spatial weights (row-standardized).

required
outcome str

Column name for the dependent variable.

required
regressors tuple[str, ...]

Column names for the independent variables.

required
period str | None

If given, extract only this period as a cross-section. If None, collapse across periods via group means.

None

Returns:

Type Description
SpatialLagResult

A SpatialLagResult with estimated coefficients, the spatial autoregressive parameter (rho), and fit statistics.

Raises:

Type Description
ImportError

If spreg or libpysal is not installed.

Source code in src/nyc311/stats/_spatial_regression.py
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
def spatial_lag_model(
    panel: PanelDataset,
    weights: dict[str, dict[str, float]],
    outcome: str,
    regressors: tuple[str, ...],
    *,
    period: str | None = None,
) -> SpatialLagResult:
    """Fit a spatial lag (SAR) model via maximum likelihood.

    Estimates: y = rho * W @ y + X @ beta + epsilon

    Args:
        panel: A :class:`PanelDataset` containing the outcome and
            regressor columns.
        weights: Nested dict ``{unit_a: {unit_b: weight}}`` of spatial
            weights (row-standardized). Units are looked up by the
            string form of their cross-section index label; a unit
            with no entry gets an empty neighbor list.
        outcome: Column name for the dependent variable.
        regressors: Column names for the independent variables.
        period: If given, extract only this period as a cross-section.
            If ``None``, collapse across periods via group means.

    Returns:
        A :class:`SpatialLagResult` with estimated coefficients, the
        spatial autoregressive parameter (rho), and fit statistics.
        Coefficient dictionaries are keyed by ``"CONSTANT"`` followed
        by the regressor names.

    Raises:
        ImportError: If spreg or libpysal is not installed.
    """
    try:
        import numpy as np
        from libpysal.weights import W
        from spreg import ML_Lag
    except ImportError as exc:
        msg = (
            "spreg and libpysal are required for spatial_lag_model(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(msg) from exc

    df = _extract_cross_section(panel, outcome, regressors, period)

    # Key the weights object by the same string form used to look units up
    # in ``weights``. Keying by the raw index label left string neighbor
    # ids that matched no key whenever the labels were not already strings.
    neighbors: dict[str, list[str]] = {}
    weight_vals: dict[str, list[float]] = {}
    for uid in df.index:
        key = str(uid)
        links = weights.get(key, {})
        neighbors[key] = list(links)
        weight_vals[key] = list(links.values())
    w = W(neighbors, weight_vals)

    y = np.asarray(df[outcome].values, dtype=float).reshape(-1, 1)
    x = np.column_stack([np.asarray(df[r].values, dtype=float) for r in regressors])

    model = ML_Lag(y, x, w, name_y=outcome, name_x=list(regressors))

    # The first len(var_names) estimates are read as the constant and the
    # regressors; the next entry is read as the autoregressive parameter.
    var_names = ["CONSTANT", *regressors]
    n_betas = len(var_names)
    coefficients = {name: float(model.betas[i][0]) for i, name in enumerate(var_names)}
    std_errors = {name: float(model.std_err[i]) for i, name in enumerate(var_names)}  # pylint: disable=no-member
    p_values = {name: float(model.z_stat[i][1]) for i, name in enumerate(var_names)}  # pylint: disable=no-member

    rho = float(model.betas[n_betas][0])
    rho_p = float(model.z_stat[n_betas][1])  # pylint: disable=no-member

    return SpatialLagResult(
        coefficients=coefficients,
        std_errors=std_errors,
        p_values=p_values,
        rho=rho,
        rho_p_value=rho_p,
        log_likelihood=float(model.logll),
        aic=float(model.aic),
        n_observations=int(model.n),
        model_summary=str(model.summary),  # pylint: disable=no-member
    )

event_study

event_study(
    panel: PanelDataset,
    outcome: str,
    *,
    covariates: tuple[str, ...] = (),
    pre_periods: int = 5,
    post_periods: int = 5,
    reference_period: int = -1,
) -> EventStudyResult

Estimate event-study coefficients with pre-trend diagnostics.

Computes mean differences between treated and control units at each relative time period, with reference_period normalized to zero.

Parameters:

Name Type Description Default
panel PanelDataset

A PanelDataset with treatment_events.

required
outcome str

Column name for the outcome variable.

required
covariates tuple[str, ...]

Additional control variable column names.

()
pre_periods int

Number of pre-treatment periods to include.

5
post_periods int

Number of post-treatment periods to include.

5
reference_period int

Relative period to normalize to zero. Defaults to -1 (one period before treatment).

-1

Returns:

Type Description
EventStudyResult

An EventStudyResult with coefficients per relative period, confidence intervals, and a pre-trend F-test.

Raises:

Type Description
ImportError

If required packages are not installed.

Source code in src/nyc311/stats/_staggered_did.py
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
def event_study(
    panel: PanelDataset,
    outcome: str,
    *,
    covariates: tuple[str, ...] = (),
    pre_periods: int = 5,
    post_periods: int = 5,
    reference_period: int = -1,
) -> EventStudyResult:
    """Estimate event-study coefficients with pre-trend diagnostics.

    For every relative period in ``[-pre_periods, post_periods]`` the
    coefficient is the average, over treated units, of the gap between
    the unit's mean outcome and the mean outcome of all never-treated
    units in the same panel period.  Coefficients are then shifted so
    that the ``reference_period`` coefficient is zero.

    Relative periods are counted in panel-period steps from the period
    matching the first seven characters of the unit's earliest
    ``treatment_date.isoformat()``.  Treated units whose treatment
    period does not appear in the panel are ignored.

    Args:
        panel: A :class:`PanelDataset` with ``treatment_events``.
        outcome: Column name for the outcome variable.
        covariates: Additional control variable column names.
            Currently accepted but unused.
        pre_periods: Number of pre-treatment periods to include.
        post_periods: Number of post-treatment periods to include.
        reference_period: Relative period to normalize to zero.
            Defaults to ``-1`` (one period before treatment).  If it
            lies outside ``[-pre_periods, post_periods]`` the first
            relative period is used for normalization instead.

    Returns:
        An :class:`EventStudyResult` with coefficients per relative
        period, confidence intervals, and a pre-trend F-test.  A
        relative period with no usable observations gets coefficient
        ``0.0`` (before normalization) and standard error ``0.0``; a
        period with a single contributing unit gets standard error
        ``0.0``.  The pre-trend statistic and p-value are ``None`` when
        no pre-period has a positive standard error.

    Raises:
        ImportError: If required packages are not installed.
    """
    try:
        import numpy as np
        import pandas as pd
        from scipy.stats import f as f_dist
    except ImportError as exc:
        msg = (
            "numpy, pandas, and scipy are required for event_study(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(msg) from exc

    _ = covariates  # reserved for future covariate adjustment

    df = panel.to_dataframe()
    if isinstance(df.index, pd.MultiIndex):
        df = df.reset_index()

    # Earliest treatment period per unit, keyed by the truncated ISO date.
    unit_treatment_dates: dict[str, str] = {}
    for te in panel.treatment_events:
        date_str = te.treatment_date.isoformat()[:7]
        for uid in te.treated_units:
            if uid not in unit_treatment_dates or date_str < unit_treatment_dates[uid]:
                unit_treatment_dates[uid] = date_str

    periods = sorted(df["period"].unique())
    period_to_idx = {p: i for i, p in enumerate(periods)}

    treated_units = set(unit_treatment_dates)
    control_units = set(df["unit_id"].unique()) - treated_units

    # The control mean depends only on the panel period, so compute it once
    # per period instead of once per (relative period, treated unit) pair.
    control_rows = df[df["unit_id"].isin(control_units)]
    control_means = {}
    for period in periods:
        ctrl_vals = control_rows.loc[
            control_rows["period"] == period, outcome
        ].to_numpy()
        if len(ctrl_vals) > 0:
            control_means[period] = np.mean(ctrl_vals)

    # Per-unit row subsets and treatment-period positions, also loop-invariant.
    anchors = {
        uid: period_to_idx[treat_period]
        for uid, treat_period in unit_treatment_dates.items()
        if treat_period in period_to_idx
    }
    unit_rows = {uid: df[df["unit_id"] == uid] for uid in anchors}

    rel_range = list(range(-pre_periods, post_periods + 1))
    coeffs: list[float] = []
    ses: list[float] = []

    for rel in rel_range:
        diffs: list[float] = []
        for uid, anchor_idx in anchors.items():
            abs_idx = anchor_idx + rel
            if abs_idx < 0 or abs_idx >= len(periods):
                continue
            target_period = periods[abs_idx]
            if target_period not in control_means:
                continue

            rows = unit_rows[uid]
            treat_vals = rows.loc[rows["period"] == target_period, outcome].to_numpy()
            if len(treat_vals) > 0:
                diffs.append(
                    float(np.mean(treat_vals) - control_means[target_period])
                )

        if diffs:
            coeffs.append(float(np.mean(diffs)))
            ses.append(
                float(np.std(diffs, ddof=1) / np.sqrt(len(diffs)))
                if len(diffs) > 1
                else 0.0
            )
        else:
            coeffs.append(0.0)
            ses.append(0.0)

    # Falls back to the first relative period when the requested reference
    # period is outside the estimated window.
    ref_idx = rel_range.index(reference_period) if reference_period in rel_range else 0
    ref_coeff = coeffs[ref_idx]
    coeffs = [c - ref_coeff for c in coeffs]

    ci_lower = [c - 1.96 * s for c, s in zip(coeffs, ses, strict=True)]
    ci_upper = [c + 1.96 * s for c, s in zip(coeffs, ses, strict=True)]

    # Only pre-periods with a positive standard error can contribute a
    # meaningful squared t-ratio.  A zero standard error means one or zero
    # units contributed; flooring it at a tiny epsilon would blow the
    # statistic up and spuriously reject the pre-trend null.
    testable = [
        i
        for i, r in enumerate(rel_range)
        if r < 0 and r != reference_period and ses[i] > 0
    ]
    pre_f = None
    pre_p = None
    if testable:
        t_ratios = np.array([coeffs[i] / ses[i] for i in testable])
        k = len(testable)
        pre_f = float(np.mean(t_ratios**2))
        # sf() avoids the cancellation in 1 - cdf() for large statistics.
        # NOTE(review): the reference distribution F(k, k) is carried over
        # unchanged from the previous implementation; confirm the intended
        # denominator degrees of freedom.
        pre_p = float(f_dist.sf(pre_f, k, k))

    return EventStudyResult(
        coefficients=tuple(coeffs),
        std_errors=tuple(ses),
        ci_lower=tuple(ci_lower),
        ci_upper=tuple(ci_upper),
        relative_periods=tuple(rel_range),
        pre_trend_f_statistic=pre_f,
        pre_trend_p_value=pre_p,
        reference_period=reference_period,
    )

staggered_did

staggered_did(
    panel: PanelDataset,
    outcome: str,
    *,
    covariates: tuple[str, ...] = (),
    cluster: str = "entity",
) -> StaggeredDiDResult

Estimate group-time ATTs under staggered treatment adoption.

For each treatment cohort and panel period, computes the difference in mean outcome between the cohort's units and never-treated units, then combines the group-time estimates into an inverse-variance-weighted aggregate. No two-way fixed-effects regression is fitted.

Parameters:

Name Type Description Default
panel PanelDataset

A :class:PanelDataset with treatment_events specifying when each unit began treatment.

required
outcome str

Column name for the outcome variable.

required
covariates tuple[str, ...]

Additional control variable column names.

()
cluster str

Clustering level for standard errors. One of "entity" (default) or "time".

'entity'

Returns:

Type Description
StaggeredDiDResult

A StaggeredDiDResult with group-time ATTs, aggregated ATT, and confidence intervals.

Raises:

Type Description
ImportError

If required packages are not installed.

ValueError

If no treatment events are found.

Source code in src/nyc311/stats/_staggered_did.py
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
def staggered_did(
    panel: PanelDataset,
    outcome: str,
    *,
    covariates: tuple[str, ...] = (),
    cluster: str = "entity",
) -> StaggeredDiDResult:
    """Estimate group-time ATTs under staggered treatment adoption.

    Units are grouped into cohorts by the first seven characters of
    their earliest ``treatment_date.isoformat()``; units without a
    treatment event form the never-treated comparison group.  For each
    cohort and each panel period in which both the cohort and the
    never-treated group have observations, the group-time estimate is
    the difference in mean outcome between the two groups, with an
    unequal-variance standard error and a normal-approximation p-value.

    The aggregated ATT is the inverse-variance-weighted average of the
    group-time estimates whose period is at or after the cohort's
    treatment period.  Pre-treatment cells are still reported in
    ``group_time_atts`` but are excluded from the aggregate, because
    they are not treatment effects.

    No regression is fitted: each cell is a raw cross-sectional mean
    difference without baseline differencing.

    Args:
        panel: A :class:`PanelDataset` with ``treatment_events``
            specifying when each unit began treatment.
        outcome: Column name for the outcome variable.
        covariates: Additional control variable column names.
            Currently accepted but unused.
        cluster: Clustering level for standard errors. One of
            ``"entity"`` (default) or ``"time"``.  Currently accepted
            but unused.

    Returns:
        A :class:`StaggeredDiDResult` with group-time ATTs,
        aggregated ATT, and confidence intervals.  When there is no
        post-treatment cell the aggregate is reported as ``0.0`` with
        standard error ``0.0`` and p-value ``1.0``.

    Raises:
        ImportError: If required packages are not installed.
        ValueError: If no treatment events are found.
    """
    try:
        import numpy as np
        import pandas as pd
        from scipy.stats import norm
    except ImportError as exc:
        msg = (
            "numpy, pandas, and scipy are required for staggered_did(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(msg) from exc

    _ = covariates, cluster  # reserved for future covariate adjustment and clustering

    if not panel.treatment_events:
        msg = "Panel must have at least one treatment event."
        raise ValueError(msg)

    df = panel.to_dataframe()
    if isinstance(df.index, pd.MultiIndex):
        df = df.reset_index()

    # Earliest treatment period per unit, keyed by the truncated ISO date.
    unit_treatment_dates: dict[str, str] = {}
    for te in panel.treatment_events:
        date_str = te.treatment_date.isoformat()[:7]
        for uid in te.treated_units:
            if uid not in unit_treatment_dates or date_str < unit_treatment_dates[uid]:
                unit_treatment_dates[uid] = date_str

    # Keep the cohort labels in a separate Series so the frame handed back by
    # the panel is not modified.
    cohort_of = df["unit_id"].map(unit_treatment_dates).fillna("never")

    cohorts = sorted(set(unit_treatment_dates.values()))
    periods = sorted(df["period"].unique())
    never_units = df[cohort_of == "never"]

    gt_atts: list[GroupTimeATT] = []
    post_flags: list[bool] = []  # parallel to gt_atts: period >= cohort
    for cohort in cohorts:
        cohort_units = df[cohort_of == cohort]

        for period in periods:
            treated_obs = cohort_units[cohort_units["period"] == period]
            control_obs = never_units[never_units["period"] == period]

            if len(treated_obs) == 0 or len(control_obs) == 0:
                continue

            y_t = treated_obs[outcome].to_numpy().astype(float)
            y_c = control_obs[outcome].to_numpy().astype(float)

            att_val = float(np.mean(y_t) - np.mean(y_c))
            var_t = float(np.var(y_t, ddof=1)) if len(y_t) > 1 else 0.0
            var_c = float(np.var(y_c, ddof=1)) if len(y_c) > 1 else 0.0
            se_val = float(np.sqrt(var_t / len(y_t) + var_c / len(y_c)))
            se_val = max(se_val, 1e-10)  # keep the z-ratio finite
            z_val = att_val / se_val
            # sf() avoids the cancellation in 1 - cdf() for large |z|.
            p_val = float(2.0 * norm.sf(abs(z_val)))

            gt_atts.append(
                GroupTimeATT(
                    group=cohort,
                    period=period,
                    att=att_val,
                    se=se_val,
                    p_value=p_val,
                )
            )
            post_flags.append(bool(period >= cohort))

    # Aggregate over post-treatment cells only.
    post_cells = [
        cell for cell, is_post in zip(gt_atts, post_flags, strict=True) if is_post
    ]
    if post_cells:
        atts = np.array([g.att for g in post_cells])
        ses = np.array([g.se for g in post_cells])
        weights = 1.0 / np.maximum(ses**2, 1e-20)
        agg_att = float(np.average(atts, weights=weights))
        agg_se = float(1.0 / np.sqrt(np.sum(weights)))
        z_agg = agg_att / max(agg_se, 1e-10)
        agg_p = float(2.0 * norm.sf(abs(z_agg)))
        agg_ci_lower = agg_att - 1.96 * agg_se
        agg_ci_upper = agg_att + 1.96 * agg_se
    else:
        agg_att = 0.0
        agg_se = 0.0
        agg_p = 1.0
        agg_ci_lower = 0.0
        agg_ci_upper = 0.0

    summary = (
        f"Staggered DiD: {len(cohorts)} cohort(s), {len(periods)} periods\n"
        f"Group-time ATTs: {len(gt_atts)}\n"
        f"Aggregated ATT: {agg_att:.4f} (SE={agg_se:.4f}, p={agg_p:.4f})"
    )

    return StaggeredDiDResult(
        group_time_atts=tuple(gt_atts),
        aggregated_att=agg_att,
        aggregated_se=agg_se,
        aggregated_p_value=agg_p,
        aggregated_ci_lower=agg_ci_lower,
        aggregated_ci_upper=agg_ci_upper,
        n_groups=len(cohorts),
        n_periods=len(periods),
        model_summary=summary,
    )

synthetic_control

synthetic_control(
    panel: PanelDataset,
    treated_unit: str,
    outcome: str,
    *,
    predictors: tuple[str, ...] = (),
    n_placebo_runs: int = 0,
) -> SyntheticControlResult

Estimate a treatment effect using the synthetic control method.

Constructs a weighted combination of untreated donor units that best reproduces the treated unit's pre-treatment trajectory, then measures the post-treatment divergence as the treatment effect.

Parameters:

Name Type Description Default
panel PanelDataset

A :class:PanelDataset with treatment information.

required
treated_unit str

The unit ID of the treated unit.

required
outcome str

Column name for the outcome variable.

required
predictors tuple[str, ...]

Additional predictor columns for matching.

()
n_placebo_runs int

Number of in-space placebos for inference. When > 0, each donor unit is iteratively treated and the ratio of post/pre MSPE is used to compute a p-value. Defaults to 0 (no placebos).

0

Returns:

Type Description
SyntheticControlResult

A SyntheticControlResult with donor weights, counterfactual series, treatment effects, and optionally a placebo p-value.

Raises:

Type Description
ImportError

If numpy, pandas, or scipy is not installed.

ValueError

If the treated unit is not found in the panel.

Source code in src/nyc311/stats/_synthetic_control.py
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
def synthetic_control(
    panel: PanelDataset,
    treated_unit: str,
    outcome: str,
    *,
    predictors: tuple[str, ...] = (),
    n_placebo_runs: int = 0,
) -> SyntheticControlResult:
    """Estimate a treatment effect using the synthetic control method.

    Finds non-negative donor weights summing to one that minimise the
    squared gap between the treated unit's pre-treatment outcome path
    and the weighted donor path (SLSQP), then reports the post-treatment
    gap between the observed and synthetic series as the treatment
    effect.

    The treatment period is the first seven characters of
    ``treatment_date.isoformat()`` for the first treatment event that
    lists ``treated_unit``.  Donors are all other panel units not listed
    in that same event; donors with any missing pre-treatment value are
    dropped.

    Args:
        panel: A :class:`PanelDataset` with treatment information.
        treated_unit: The unit ID of the treated unit.
        outcome: Column name for the outcome variable.
        predictors: Additional predictor columns for matching.
            Currently accepted but unused.
        n_placebo_runs: Maximum number of in-space placebos for
            inference.  When ``> 0``, up to that many donor units are
            each treated as if they were the treated unit and the ratio
            of post/pre MSPE is used to compute a p-value.  Defaults to
            ``0`` (no placebos).

    Returns:
        A :class:`SyntheticControlResult` with donor weights (only
        weights above ``1e-4`` are kept), counterfactual series,
        treatment effects, and optionally a placebo p-value.  The
        p-value is ``None`` when no placebo was run, including when
        fewer than two usable donors exist.

    Raises:
        ImportError: If numpy, pandas, or scipy is not installed.
        ValueError: If the treated unit is not found in the panel, has
            no treatment event, or no donor unit has complete
            pre-treatment data.
    """
    try:
        import numpy as np
        import pandas as pd
        from scipy.optimize import minimize
    except ImportError as exc:
        msg = (
            "numpy, pandas, and scipy are required for synthetic_control(). "
            "Install with: pip install nyc311[stats]"
        )
        raise ImportError(msg) from exc

    _ = predictors  # reserved for future matching on covariates

    df = panel.to_dataframe()
    if isinstance(df.index, pd.MultiIndex):
        df = df.reset_index()

    if treated_unit not in df["unit_id"].to_numpy():
        msg = f"treated_unit {treated_unit!r} not found in panel."
        raise ValueError(msg)

    treatment_event = next(
        (te for te in panel.treatment_events if treated_unit in te.treated_units),
        None,
    )
    if treatment_event is None:
        msg = f"No treatment event found for unit {treated_unit!r}."
        raise ValueError(msg)

    treatment_date_str = treatment_event.treatment_date.isoformat()[:7]
    periods = sorted(df["period"].unique())
    pre_periods = [p for p in periods if p < treatment_date_str]
    post_periods = [p for p in periods if p >= treatment_date_str]

    # NOTE(review): only units in the *same* treatment event are excluded;
    # units treated by a different event remain in the donor pool.
    donor_ids = [
        uid
        for uid in panel.unit_ids
        if uid != treated_unit and uid not in treatment_event.treated_units
    ]

    pivot = df.pivot_table(
        index="period", columns="unit_id", values=outcome, aggfunc="mean"
    )
    pivot = pivot.reindex(periods)

    treated_pre = pivot.loc[pre_periods, treated_unit].to_numpy().astype(float)
    donor_pre = pivot.loc[pre_periods, donor_ids].to_numpy().astype(float)

    # Drop donors with any gap in the pre-treatment window.
    valid_donors = ~np.isnan(donor_pre).any(axis=0)
    donor_ids_clean = [uid for uid, ok in zip(donor_ids, valid_donors) if ok]
    donor_pre = donor_pre[:, valid_donors]

    n_donors = len(donor_ids_clean)
    if n_donors == 0:
        # Without donors there is nothing to optimise over; fail clearly
        # instead of handing the solver an empty weight vector.
        msg = (
            f"No donor units with complete pre-treatment data for {treated_unit!r}."
        )
        raise ValueError(msg)

    def _loss(w: Any) -> float:
        synthetic = donor_pre @ w
        return float(np.sum((treated_pre - synthetic) ** 2))

    w0 = np.ones(n_donors) / n_donors
    bounds = [(0.0, 1.0)] * n_donors
    constraints = {"type": "eq", "fun": lambda w: np.sum(w) - 1.0}

    res = minimize(_loss, w0, method="SLSQP", bounds=bounds, constraints=constraints)
    w_star = res.x

    treated_full = pivot.loc[periods, treated_unit].to_numpy().astype(float)
    donor_full = pivot.loc[periods, donor_ids_clean].to_numpy().astype(float)
    counterfactual = donor_full @ w_star
    effect = treated_full - counterfactual

    pre_mspe = float(np.mean((treated_pre - donor_pre @ w_star) ** 2))
    post_mask = np.array([p >= treatment_date_str for p in periods], dtype=bool)
    att = float(np.mean(effect[post_mask]))

    donor_weights = {
        uid: float(w_star[i])
        for i, uid in enumerate(donor_ids_clean)
        if w_star[i] > 1e-4
    }

    placebo_p = None
    # A placebo fit needs at least one *other* donor to build its synthetic
    # series from, so placebos require two or more usable donors.
    placebo_units = (
        donor_ids_clean[:n_placebo_runs]
        if n_placebo_runs > 0 and n_donors > 1
        else []
    )
    if placebo_units:
        treated_ratio = _mspe_ratio(effect, pre_periods, post_periods, periods)
        more_extreme = 0
        for placebo_unit in placebo_units:
            placebo_pre = pivot.loc[pre_periods, placebo_unit].to_numpy().astype(float)
            other_donors = [d for d in donor_ids_clean if d != placebo_unit]
            placebo_donor_pre = (
                pivot.loc[pre_periods, other_donors].to_numpy().astype(float)
            )

            n_pd = len(other_donors)
            pw0 = np.ones(n_pd) / n_pd

            # Bind the per-iteration arrays as defaults to avoid late binding.
            def _ploss(
                w: Any, _pp: Any = placebo_pre, _dp: Any = placebo_donor_pre
            ) -> float:
                return float(np.sum((_pp - _dp @ w) ** 2))

            pbounds = [(0.0, 1.0)] * n_pd
            pcons = {"type": "eq", "fun": lambda w: np.sum(w) - 1.0}
            pres = minimize(
                _ploss, pw0, method="SLSQP", bounds=pbounds, constraints=pcons
            )

            placebo_full = pivot.loc[periods, placebo_unit].to_numpy().astype(float)
            placebo_donor_full = (
                pivot.loc[periods, other_donors].to_numpy().astype(float)
            )
            placebo_effect = placebo_full - placebo_donor_full @ pres.x

            pr = _mspe_ratio(placebo_effect, pre_periods, post_periods, periods)
            if pr >= treated_ratio:
                more_extreme += 1

        # Divide by the number of placebos actually run (plus the treated
        # unit itself), not the number requested: asking for more placebos
        # than there are donors must not shrink the p-value.
        placebo_p = (more_extreme + 1) / (len(placebo_units) + 1)

    summary_lines = [
        f"Synthetic Control: {treated_unit}",
        f"Pre-treatment periods: {len(pre_periods)}",
        f"Post-treatment periods: {len(post_periods)}",
        f"Donors used: {len(donor_weights)}",
        f"Pre-treatment MSPE: {pre_mspe:.6f}",
        f"ATT: {att:.4f}",
    ]
    if placebo_p is not None:
        summary_lines.append(f"Placebo p-value: {placebo_p:.4f}")

    return SyntheticControlResult(
        treated_unit=treated_unit,
        donor_weights=donor_weights,
        counterfactual=tuple(float(c) for c in counterfactual),
        observed=tuple(float(o) for o in treated_full),
        treatment_effect=tuple(float(e) for e in effect),
        att=att,
        periods=tuple(str(p) for p in periods),
        pre_treatment_mspe=pre_mspe,
        placebo_p_value=placebo_p,
        model_summary="\n".join(summary_lines),
    )

CLI

nyc311.cli

Command-line entrypoints for nyc311.

main

main(argv: Sequence[str] | None = None) -> int

Run the implemented fetch and complaint-topic export commands.

Source code in src/nyc311/cli/_main.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def main(argv: Sequence[str] | None = None) -> int:
    """Parse CLI arguments and dispatch to the ``topics`` or ``fetch`` command.

    Args:
        argv: Argument strings to parse; when ``None`` the parser's own
            default argument source is used.

    Returns:
        ``0`` after the selected command has run.

    Raises:
        AssertionError: If the parsed command is neither ``topics`` nor
            ``fetch``.
    """
    parser = build_parser()
    cli_args = None if argv is None else list(argv)
    args = parser.parse_args(cli_args)
    command = args.command

    # Reject anything the dispatcher below does not know about up front.
    if command != "topics" and command != "fetch":
        raise AssertionError(f"Unsupported command: {args.command}")

    # Both commands start from the same request filter.
    filters = build_service_request_filter(args)

    if command == "fetch":
        fetch_service_requests(
            filters=filters,
            socrata_config=build_socrata_config(args),
            output=Path(args.output),
        )
        return 0

    # Remaining case: the "topics" export.
    wants_geojson = args.format == "geojson"
    if wants_geojson and not args.boundaries:
        parser.error("--boundaries is required when --format geojson is used.")

    run_topic_pipeline(
        args.source,
        args.complaint_type,
        geography=args.geography,
        filters=filters,
        top_n=args.top_n,
        output=Path(args.output),
        output_format=args.format,
        boundaries=args.boundaries,
    )
    return 0