AirNow metadata options and other updates #146

Draft: wants to merge 33 commits into base: stable from airnow-metas

Changes from 1 commit

Commits (33)
a4f364a
Start of new AirNow meta reader
zmoon Oct 30, 2023
3f6a4b5
Read v2 site meta file
zmoon Oct 30, 2023
a7259e0
doc
zmoon Oct 30, 2023
948152b
Column names
zmoon Oct 30, 2023
eb46947
today optional in merge func
zmoon Oct 30, 2023
2324ceb
`airnow` param not used in new version
zmoon Oct 30, 2023
9780999
Silently don't use s3 for Python 3.6
zmoon Oct 31, 2023
29c3960
some cleanup
zmoon Oct 31, 2023
7257997
Use S3 for AirNow data files
zmoon Oct 31, 2023
53394fb
docs work
zmoon Oct 31, 2023
fa02513
docs work
zmoon Oct 31, 2023
f794b7b
Save the today meta df
zmoon Oct 31, 2023
8d1818f
More doc
zmoon Oct 31, 2023
c3eaf7b
Let exception be raised instead of returning empty df
zmoon Oct 31, 2023
c31c2cf
note
zmoon Oct 31, 2023
d56262b
Various updates
zmoon Oct 31, 2023
cce9cd8
Ignore a numpy deprecation warning from pandas
zmoon Oct 31, 2023
e8d4c7b
Deal with some warnings
zmoon Nov 1, 2023
13e2ae4
doc dates
zmoon Nov 1, 2023
6fba9a8
Dask option for getting multiple site metadata files
zmoon Nov 1, 2023
6cbedc6
Merge optional when getting site metadata for df
zmoon Nov 1, 2023
8dc7865
Pass through today metadata option
zmoon Nov 1, 2023
ee6d5c0
sp
zmoon Nov 1, 2023
5e42d8e
Use new func in old airnow section
zmoon Nov 1, 2023
252ea14
Set dtype for all
zmoon Nov 1, 2023
7710569
Towards AQS meta
zmoon Nov 1, 2023
34dba4c
Towards allowing using local EPA site metadata files
zmoon Nov 8, 2023
e5d9f1e
Load both files and create site IDs
zmoon Nov 8, 2023
2e1a740
Move to using new AQS fn
zmoon Nov 8, 2023
945cab7
Optionally include parameter list for AirNow meta
zmoon Nov 8, 2023
ccd4b88
doc
zmoon Nov 8, 2023
3885636
Merge remote-tracking branch 'noaa/develop' into airnow-metas
zmoon Nov 14, 2023
4ef1028
Towards better AQS site metadata df
zmoon Nov 15, 2023
Various updates
zmoon committed Oct 31, 2023

commit d56262bff0977a14beab8e6f537ee65e47dc3eaf
136 changes: 84 additions & 52 deletions monetio/obs/airnow.py
@@ -1,12 +1,34 @@
"""AirNow"""

import os
import re
import sys
from datetime import datetime
from pathlib import Path

import pandas as pd

_today_monitor_df = None
_hourly_cols = [
"date",
"time",
"siteid",
"site",
"utcoffset",
"variable",
"units",
"obs",
"source",
]
_daily_cols = [
"date",
"siteid",
"site",
"variable",
"units",
"obs",
"hours",
"source",
]
_savecols = [
"time",
"siteid",
@@ -24,6 +46,7 @@
"state_name",
"epa_region",
]
_today_monitor_df = None
_TFinder = None


@@ -45,7 +68,7 @@ def build_urls(dates, *, daily=False):

urls = []
fnames = []
print("Building AIRNOW URLs...")
print("Building AirNow URLs...")
if sys.version_info < (3, 7):
base_url = "https://s3-us-west-1.amazonaws.com/files.airnowtech.org/airnow/"
else:
@@ -68,13 +91,16 @@ def build_urls(dates, *, daily=False):
return urls, fnames
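
For reference, `build_urls` takes a collection of datetimes and returns parallel lists of file URLs and names, using the S3 base URL on Python 3.7+. A minimal usage sketch (the import path follows the repo layout; how the function handles duplicate daily file names is an assumption not shown here):

```python
import pandas as pd

from monetio.obs import airnow

dates = pd.date_range("2023-10-31", periods=3, freq="H")
urls, fnames = airnow.build_urls(dates)                # hourly data files
durls, dfnames = airnow.build_urls(dates, daily=True)  # daily data file(s)
```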


def read_csv(fn):
def read_csv(fn, *, daily=None):
"""Read an AirNow CSV file.

Parameters
----------
fn : str
fn
File to read, passed to :func:`pandas.read_csv`.
daily : bool, optional
Is this a daily (``True``) or hourly (``False``) file?
By default, attempt to determine based on the file name.

Returns
-------
@@ -83,48 +109,60 @@ def read_csv(fn):
Additional processing done by :func:`aggregate_files` / :func:`add_data`
not applied.
"""
hourly_cols = [
"date",
"time",
"siteid",
"site",
"utcoffset",
"variable",
"units",
"obs",
"source",
]
daily_cols = ["date", "siteid", "site", "variable", "units", "obs", "hours", "source"]
dft = pd.read_csv(
from monetio.util import _get_pandas_version

pd_ver = _get_pandas_version()

if daily is None:
if isinstance(fn, Path):
fn_str = fn.as_posix()
else:
fn_str = str(fn)

if fn_str.endswith("daily_data.dat"):
daily = True
elif re.search(r"HourlyData_[0-9]{10}.dat", fn_str) is not None:
daily = False
else:
raise ValueError("Could not determine if file is daily or hourly")

dtype = {"siteid": str, "obs": float}
zmoon (Member, Author) commented:
Need to ensure MSA code and such are read as str, not int/float. cmsa_name and msa_code seem to need this, or they get converted to float64 (in order to represent missing with NaN).
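
A sketch of the suggested fix (the column names are the ones named in the comment; the file name and exact reader call are hypothetical):

```python
import pandas as pd

# Pin code-like metadata columns to str up front; otherwise pandas uses NaN
# for missing values and demotes integer-looking codes to float64.
meta_dtype = {"siteid": str, "msa_code": str, "cmsa_name": str}
meta = pd.read_csv("site_meta.dat", delimiter="|", dtype=meta_dtype)  # hypothetical file
```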

if daily:
names = _daily_cols
else:
names = _hourly_cols
dtype.update(utcoffset=float)

if pd_ver < (1, 3):
on_bad = dict(
error_bad_lines=False,
warn_bad_lines=True,
)
else:
on_bad = dict(on_bad_lines="warn")

df = pd.read_csv(
fn,
delimiter="|",
header=None,
error_bad_lines=False,
warn_bad_lines=True,
names=names,
parse_dates=False,
dtype=dtype,
encoding="ISO-8859-1",
**on_bad,
)
# TODO: `error_bad_lines` is deprecated from v1.3 (use `on_bad_lines='warn'` instead)
# TODO: or hourly/daily option (to return proper empty df)?

# Assign column names
ncols = dft.columns.size
daily = False
if ncols == len(hourly_cols):
dft.columns = hourly_cols
elif ncols == len(hourly_cols) - 1: # daily data
daily = True
dft.columns = daily_cols
else:
raise Exception(f"unexpected number of columns: {ncols}")

dft["obs"] = dft.obs.astype(float)
# ^ TODO: could use smaller float type, provided precision is low
dft["siteid"] = dft.siteid.str.zfill(9)
# ^ TODO: does nothing; and some site IDs are longer (12) or start with letters
if not daily:
dft["utcoffset"] = dft.utcoffset.astype(int) # FIXME: some sites have fractional UTC offset
# TODO: pandas v2 path using `date_format`?

return dft
if daily:
df["time"] = pd.to_datetime(df["date"], format=r"%m/%d/%y", exact=True)
else:
df["time"] = pd.to_datetime(
df["date"] + " " + df["time"], format=r"%m/%d/%y %H:%M", exact=True
)
df = df.drop(columns=["date"])

return df
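
Taken together, the reworked `read_csv` infers hourly vs. daily from the file name and raises rather than guessing when the name is ambiguous. A sketch of the call patterns (file names are illustrative, matching the patterns checked above):

```python
from monetio.obs import airnow

# Matches r"HourlyData_[0-9]{10}.dat" -> read as hourly
df_hourly = airnow.read_csv("HourlyData_2023103112.dat")

# Ends with "daily_data.dat" -> read as daily
df_daily = airnow.read_csv("20231031_daily_data.dat")

# Ambiguous name -> `daily` must be passed explicitly, else ValueError
df_local = airnow.read_csv("local_copy.csv", daily=False)
```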


def retrieve(url, fname):
@@ -181,7 +219,7 @@ def aggregate_files(dates, *, download=False, n_procs=1, daily=False, bad_utcoff
import dask
import dask.dataframe as dd

print("Aggregating AIRNOW files...")
print("Aggregating AirNow files...")

urls, fnames = build_urls(dates, daily=daily)
if download:
@@ -193,23 +231,17 @@ def aggregate_files(dates, *, download=False, n_procs=1, daily=False, bad_utcoff
dff = dd.from_delayed(dfs)
df = dff.compute(num_workers=n_procs).reset_index()

# Datetime conversion
if daily:
df["time"] = pd.to_datetime(df.date, format=r"%m/%d/%y", exact=True)
else:
df["time"] = pd.to_datetime(
df.date + " " + df.time, format=r"%m/%d/%y %H:%M", exact=True
) # TODO: move to read_csv? (and some of this other stuff too?)
df["time_local"] = df.time + pd.to_timedelta(df.utcoffset, unit="H")
df.drop(["date"], axis=1, inplace=True)
# Add LT column
if not daily:
df["time_local"] = df["time"] + pd.to_timedelta(df["utcoffset"], unit="H")

print(" Adding in Meta-data")
print(" Adding in site metadata")
df = get_station_locations(df)
if daily:
df = df[[col for col in _savecols if col not in {"time_local", "utcoffset"}]]
else:
df = df[_savecols]
df.drop_duplicates(inplace=True)
df.drop_duplicates(inplace=True)  # TODO: shouldn't be needed

df = filter_bad_values(df, bad_utcoffset=bad_utcoffset)
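
Note that `time_local` is a plain offset shift applied to the naive UTC timestamps, not a timezone-aware conversion. A worked example of the computation (values illustrative):

```python
import pandas as pd

t_utc = pd.Timestamp("2023-10-31 12:00")  # naive timestamp, understood as UTC
utcoffset = -5.0                          # hours relative to UTC
t_local = t_utc + pd.to_timedelta(utcoffset, unit="H")
print(t_local)  # 2023-10-31 07:00:00
```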

@@ -309,7 +341,7 @@ def filter_bad_values(df, *, max=3000, bad_utcoffset="drop"):
else:
raise ValueError("`bad_utcoffset` must be one of: 'null', 'drop', 'fix', 'leave'")

return df # TODO: dropna here (since it is called `filter_bad_values`)?
return df # TODO: optionally dropna here (since it is called `filter_bad_values`)?


def get_utcoffset(lat, lon):
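
For context, `filter_bad_values` (signature in the hunk header above) screens out-of-range observations and handles suspect UTC offsets per `bad_utcoffset` ('null', 'drop', 'fix', or 'leave'). A hedged usage sketch; the column set here is a guess from this diff and the tests, and real frames come from `aggregate_files`/`add_data`:

```python
import pandas as pd

from monetio.obs import airnow

df = pd.DataFrame(
    {
        "siteid": ["000010101"],
        "obs": [12.0],        # obs above `max` are treated as bad
        "utcoffset": [-5.0],  # plausible offset, so 'drop' keeps the row
        "latitude": [40.0],
        "longitude": [-105.0],
    }
)
cleaned = airnow.filter_bad_values(df, max=3000, bad_utcoffset="drop")
```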
49 changes: 49 additions & 0 deletions monetio/util.py
@@ -1,3 +1,6 @@
from collections import namedtuple


def nearest(items, pivot):
return min(items, key=lambda x: abs(x - pivot))

@@ -378,6 +381,52 @@ def _import_required(mod_name: str):
) from e


_SimpleVersion = namedtuple("_SimpleVersion", ["major", "minor", "patch"])


def _parse_version(s):
"""Parse a version string into a ``_SimpleVersion`` tuple (major, minor, patch).

Uses ``packaging.version.parse`` if available, otherwise a simple parser.
"""
try:
from packaging.version import parse
except ImportError:
have_packaging = False
else:
have_packaging = True

if not have_packaging:
parts = s.split(".")

major = int(parts[0])
minor = int(parts[1])

if len(parts) >= 3:
patch_chars = []
for c in parts[2]:
if not c.isdigit():
break
patch_chars.append(c)
patch = int("".join(patch_chars))
else:
patch = 0
else:
v = parse(s)
major = v.major
minor = v.minor
patch = v.micro

return _SimpleVersion(major, minor, patch)


def _get_pandas_version():
"""Get pandas major, minor, and patch version from ``pandas.__version__``."""
import pandas as pd

return _parse_version(pd.__version__)
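
A quick illustration of the helpers (outputs hold for both the `packaging` path and the fallback parser):

```python
from monetio.util import _get_pandas_version, _parse_version

_parse_version("1.3.5")     # _SimpleVersion(major=1, minor=3, patch=5)
_parse_version("2.1.0rc0")  # _SimpleVersion(major=2, minor=1, patch=0)

# namedtuple comparison works against plain tuples, so callers can gate
# behavior like the error_bad_lines/on_bad_lines switch in read_csv above:
if _get_pandas_version() < (1, 3):
    kwargs = dict(error_bad_lines=False, warn_bad_lines=True)
else:
    kwargs = dict(on_bad_lines="warn")
```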


def _try_merge_exact(left, right, *, right_name=None):
"""For two ``xr.Dataset``s, try ``left.merge(right, compat="equals", join="exact")``.
If it fails, print informative debugging messages and re-raise.
10 changes: 10 additions & 0 deletions tests/test_airnow.py
@@ -96,3 +96,13 @@ def test_check_zero_utc_offsets(date, bad_utcoffset):
assert not df.utcoffset.isnull().any()
assert bad_sites.empty
assert ((df.utcoffset >= -12) & (df.utcoffset <= 14)).all()


def test_hourly_vs_daily_cols():
assert airnow._hourly_cols != airnow._daily_cols
hourly_col_set = set(airnow._hourly_cols)
daily_col_set = set(airnow._daily_cols)
assert len(hourly_col_set) == len(airnow._hourly_cols)
assert len(daily_col_set) == len(airnow._daily_cols)
assert hourly_col_set - daily_col_set == {"time", "utcoffset"}
assert daily_col_set - hourly_col_set == {"hours"}