Skip to content
Merged
122 changes: 122 additions & 0 deletions collect_parquet_builtin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
from pathlib import Path
import json

# Currently requires a special branch of pyarrow with extra GeoArrow features
# https://github.com/apache/arrow/compare/main...paleolimbot:arrow:parquet-geo-write-files-from-geoarrow
import pyarrow as pa
from pyarrow import parquet
import geoarrow.pyarrow as ga

# Directory containing this script; manifest.json and the per-group
# "files" directories are resolved relative to it.
here = Path(__file__).parent


def list_wkb_files():
    """Return local paths of every ``arrows/wkb`` file listed in the manifest.

    Reads ``manifest.json`` from the directory containing this script. For
    each file entry whose ``format`` is ``"arrows/wkb"``, the basename of the
    entry's ``url`` is mapped to ``<here>/<group name>/files/<basename>``.

    Returns:
        A list of ``Path`` objects in manifest order.

    Raises:
        FileNotFoundError: If a file listed in the manifest is not present
            locally.
    """
    # JSON is UTF-8 by specification; don't depend on the locale default.
    with open(here / "manifest.json", encoding="utf-8") as f:
        manifest = json.load(f)

    wkb_files = []
    for group in manifest["groups"]:
        for file in group["files"]:
            if file["format"] != "arrows/wkb":
                continue

            local_path = here / group["name"] / "files" / Path(file["url"]).name
            # Explicit check instead of ``assert``: assertions are stripped
            # under ``python -O`` and would not say which file is missing.
            if not local_path.exists():
                raise FileNotFoundError(
                    f"File listed in manifest not found locally: {local_path}"
                )
            wkb_files.append(local_path)

    return wkb_files


def convert_arrow_wkb_to_parquet(src, dst, compression):
    """Copy an Arrow IPC stream into a Parquet file, batch by batch.

    Args:
        src: Source Arrow IPC stream, passed to ``pa.ipc.open_stream``.
        dst: Destination passed to ``parquet.ParquetWriter``.
        compression: Compression setting forwarded to the Parquet writer.
    """
    # NOTE(review): write_geospatial_logical_types presumably relies on the
    # GeoArrow-enabled pyarrow build mentioned at the top of the file - confirm.
    writer_options = {
        "store_schema": False,
        "compression": compression,
        "write_geospatial_logical_types": True,
    }

    with pa.ipc.open_stream(src) as ipc_reader:
        with parquet.ParquetWriter(
            dst, ipc_reader.schema, **writer_options
        ) as pq_writer:
            print(f"Reading {src}")
            # Write each IPC batch individually so the chunking of the
            # source carries over into Parquet and the statistics are
            # theoretically the same.
            for record_batch in ipc_reader:
                pq_writer.write_batch(record_batch)
            print(f"Wrote {dst}")


def check_parquet_file(src, dst):
    """Check that a Parquet file reads back equal to its Arrow IPC source.

    Args:
        src: Arrow IPC stream the Parquet file was generated from.
        dst: Parquet file to verify.

    Returns:
        ``True`` when both the Arrow schema and the table contents compare
        equal to the source; ``False`` (after printing the reason) otherwise.
    """
    # The IPC stream is the reference the Parquet file is compared against.
    with pa.ipc.open_stream(src) as ipc_reader:
        expected = ipc_reader.read_all()

    print(f"Checking {dst}")
    with parquet.ParquetFile(dst, arrow_extensions_enabled=True) as pq_file:
        actual_schema = pq_file.schema_arrow
        if actual_schema != expected.schema:
            print(f"Schema mismatch:\n{actual_schema}\nvs\n{expected.schema}")
            return False

        # Schemas agree; now compare the data itself.
        if pq_file.read() != expected:
            print("Table mismatch")
            return False

    return True


def generate_parquet_testing_files(wkb_files, parquet_testing_path):
    """Write uncompressed Parquet copies of selected files for parquet-testing.

    Args:
        wkb_files: Paths to ``*_wkb.arrows`` source files.
        parquet_testing_path: Directory that receives ``<name>.parquet``.

    Raises:
        ValueError: If any written file fails its round-trip check.
    """
    n_written = 0
    n_ok = 0
    for wkb_path in wkb_files:
        name = wkb_path.name.replace("_wkb.arrows", "")

        # Big files are left out (ns-water_water-point is the one ns-water
        # file that is kept).
        is_big = "microsoft-buildings" in name or (
            "ns-water" in name and name != "ns-water_water-point"
        )
        # This CRS example carries a non-PROJJSON value on purpose (allowed
        # in GeoArrow), which is rightly rejected by Parquet.
        has_non_projjson_crs = "wkt2" in name

        if is_big or has_non_projjson_crs:
            print(f"Skipping {name}")
            continue

        target = parquet_testing_path / f"{name}.parquet"
        convert_arrow_wkb_to_parquet(wkb_path, target, compression="none")
        n_written += 1
        if check_parquet_file(wkb_path, target):
            n_ok += 1

    # Every file is attempted before failing so all mismatches get printed.
    if n_ok != n_written:
        raise ValueError("Some checks failed when generating testing files")


def generate_geoarrow_data_parquet_files(wkb_files):
    """Write a Parquet copy of each source file next to the source itself.

    Args:
        wkb_files: Paths to ``*_wkb.arrows`` source files.

    Raises:
        ValueError: If any written file fails its round-trip check.
    """
    # Files with these name prefixes are written with zstd compression
    # (presumably because they are large - confirm); the rest are
    # uncompressed.
    zstd_prefixes = ("ns-water", "microsoft")

    n_written = 0
    n_ok = 0
    for wkb_path in wkb_files:
        name = wkb_path.name.replace("_wkb.arrows", "")
        if "wkt2" in name:
            print(f"Skipping {name}")
            continue

        codec = "zstd" if name.startswith(zstd_prefixes) else "none"
        target = wkb_path.parent / f"{name}.parquet"
        convert_arrow_wkb_to_parquet(wkb_path, target, compression=codec)
        n_written += 1
        if check_parquet_file(wkb_path, target):
            n_ok += 1

    # Every file is attempted before failing so all mismatches get printed.
    if n_ok != n_written:
        raise ValueError("Some checks failed when generating testing files")


if __name__ == "__main__":
    # Resolve the manifest once and feed the same inputs to both outputs:
    # first the sibling parquet-testing checkout, then this repository.
    wkb_inputs = list_wkb_files()
    generate_parquet_testing_files(
        wkb_inputs,
        here.parent / "parquet-testing" / "data" / "geospatial",
    )
    generate_geoarrow_data_parquet_files(wkb_inputs)
14 changes: 7 additions & 7 deletions example-crs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ All versions of Natural Earth map data redistributed from this repository are in

## Files

- vermont-crs84 ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc4/example-crs/files/example-crs_vermont-crs84_wkb.arrows), [geoparquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc4/example-crs/files/example-crs_vermont-crs84.parquet), [fgb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc4/example-crs/files/example-crs_vermont-crs84.fgb))
- vermont-4326 ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc4/example-crs/files/example-crs_vermont-4326_wkb.arrows), [geoparquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc4/example-crs/files/example-crs_vermont-4326.parquet), [fgb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc4/example-crs/files/example-crs_vermont-4326.fgb))
- vermont-utm ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc4/example-crs/files/example-crs_vermont-utm_wkb.arrows), [geoparquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc4/example-crs/files/example-crs_vermont-utm.parquet), [fgb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc4/example-crs/files/example-crs_vermont-utm.fgb))
- vermont-custom ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc4/example-crs/files/example-crs_vermont-custom_wkb.arrows), [geoparquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc4/example-crs/files/example-crs_vermont-custom.parquet), [fgb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc4/example-crs/files/example-crs_vermont-custom.fgb))
- vermont-crs84-wkt2 ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc4/example-crs/files/example-crs_vermont-crs84-wkt2_wkb.arrows))
- vermont-crs84-auth-code ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc4/example-crs/files/example-crs_vermont-crs84-auth-code_wkb.arrows))
- vermont-crs84-unknown ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc4/example-crs/files/example-crs_vermont-crs84-unknown_wkb.arrows))
- vermont-crs84 ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-crs84_wkb.arrows), [geoparquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-crs84_geo.parquet), [fgb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-crs84.fgb), [parquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-crs84.parquet))
- vermont-4326 ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-4326_wkb.arrows), [geoparquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-4326_geo.parquet), [fgb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-4326.fgb), [parquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-4326.parquet))
- vermont-utm ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-utm_wkb.arrows), [geoparquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-utm_geo.parquet), [fgb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-utm.fgb), [parquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-utm.parquet))
- vermont-custom ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-custom_wkb.arrows), [geoparquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-custom_geo.parquet), [fgb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-custom.fgb), [parquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-custom.parquet))
- vermont-crs84-wkt2 ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-crs84-wkt2_wkb.arrows))
- vermont-crs84-auth-code ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-crs84-auth-code_wkb.arrows), [parquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-crs84-auth-code.parquet))
- vermont-crs84-unknown ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-crs84-unknown_wkb.arrows), [parquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc5/example-crs/files/example-crs_vermont-crs84-unknown.parquet))
8 changes: 4 additions & 4 deletions example-crs/collect.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def write_parquet(lazy=True):
pd = None

for crs, label in zip(CRSES, CRS_LABELS):
out = here / "files" / f"example-crs_vermont-{label}.parquet"
out = here / "files" / f"example-crs_vermont-{label}_geo.parquet"
if lazy and out.exists():
continue

Expand All @@ -55,7 +55,7 @@ def write_geoarrow():
out = here / "files" / f"example-crs_vermont-{label}_wkb.arrows"

tab = io.read_geoparquet_table(
here / "files" / f"example-crs_vermont-{label}.parquet"
here / "files" / f"example-crs_vermont-{label}_geo.parquet"
)

# Ensure we write PROJJSON explicitly for these examples. Probably
Expand All @@ -71,7 +71,7 @@ def write_fgb():
out = here / "files" / f"example-crs_vermont-{label}.fgb"

tab = io.read_geoparquet_table(
here / "files" / f"example-crs_vermont-{label}.parquet"
here / "files" / f"example-crs_vermont-{label}_geo.parquet"
)

# geoarrow-rust needs "native" and not WKB-encoding
Expand All @@ -81,7 +81,7 @@ def write_fgb():


def write_geoarrow_alternative_crses():
tab = io.read_geoparquet_table(here / "files" / "example-crs_vermont-crs84.parquet")
tab = io.read_geoparquet_table(here / "files" / "example-crs_vermont-crs84_geo.parquet")

# Construct these metadatas by hand since that's the whole point of this data
extension_metadata = {
Expand Down
Binary file modified example-crs/files/example-crs_vermont-4326.parquet
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified example-crs/files/example-crs_vermont-crs84.parquet
Binary file not shown.
Binary file not shown.
Binary file modified example-crs/files/example-crs_vermont-custom.parquet
Binary file not shown.
Binary file not shown.
Binary file modified example-crs/files/example-crs_vermont-utm.parquet
Binary file not shown.
Binary file not shown.
3 changes: 2 additions & 1 deletion example-crs/manifest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@ format:
- arrows/wkb
- geoparquet
- fgb
- parquet
file_location: repo
files:
- name: vermont-crs84
- name: vermont-4326
- name: vermont-utm
- name: vermont-custom
- name: vermont-crs84-wkt2
skip_format: [geoparquet, fgb]
skip_format: [geoparquet, fgb, parquet]
- name: vermont-crs84-auth-code
skip_format: [geoparquet, fgb]
- name: vermont-crs84-unknown
Expand Down
Loading