Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added test_data/data-linestring-encoding_native.parquet
Binary file not shown.
Binary file added test_data/data-linestring-encoding_wkb.parquet
Binary file not shown.
4 changes: 4 additions & 0 deletions test_data/data-linestring-wkt.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"col","geometry"
0,"LINESTRING (30 10, 10 30, 40 40)"
1,"LINESTRING EMPTY"
2,
Binary file not shown.
Binary file not shown.
5 changes: 5 additions & 0 deletions test_data/data-multilinestring-wkt.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"col","geometry"
0,"MULTILINESTRING ((30 10, 10 30, 40 40))"
1,"MULTILINESTRING ((10 10, 20 20, 10 40), (40 40, 30 30, 40 20, 30 10))"
2,"MULTILINESTRING EMPTY"
3,
Binary file added test_data/data-multipoint-encoding_native.parquet
Binary file not shown.
Binary file added test_data/data-multipoint-encoding_wkb.parquet
Binary file not shown.
5 changes: 5 additions & 0 deletions test_data/data-multipoint-wkt.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"col","geometry"
0,"MULTIPOINT ((30 10))"
1,"MULTIPOINT ((10 40), (40 30), (20 20), (30 10))"
2,"MULTIPOINT EMPTY"
3,
Binary file not shown.
Binary file added test_data/data-multipolygon-encoding_wkb.parquet
Binary file not shown.
6 changes: 6 additions & 0 deletions test_data/data-multipolygon-wkt.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"col","geometry"
0,"MULTIPOLYGON (((30 10, 40 40, 20 40, 10 20, 30 10)))"
1,"MULTIPOLYGON (((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 20, 5 10, 15 5)))"
2,"MULTIPOLYGON (((40 40, 20 45, 45 30, 40 40)), ((20 35, 10 30, 10 10, 30 5, 45 20, 20 35), (30 20, 20 15, 20 25, 30 20)))"
3,"MULTIPOLYGON EMPTY"
4,
Binary file added test_data/data-point-encoding_native.parquet
Binary file not shown.
Binary file added test_data/data-point-encoding_wkb.parquet
Binary file not shown.
5 changes: 5 additions & 0 deletions test_data/data-point-wkt.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"col","geometry"
0,"POINT (30 10)"
1,"POINT EMPTY"
2,
3,"POINT (40 40)"
Binary file added test_data/data-polygon-encoding_native.parquet
Binary file not shown.
Binary file added test_data/data-polygon-encoding_wkb.parquet
Binary file not shown.
5 changes: 5 additions & 0 deletions test_data/data-polygon-wkt.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"col","geometry"
0,"POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"
1,"POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 30))"
2,"POLYGON EMPTY"
3,
206 changes: 206 additions & 0 deletions test_data/generate_test_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
"""
Generates example data using pyarrow by running `python generate_test_data.py`.

You can print the metadata with:

.. code-block:: python

>>> import json, pprint, pyarrow.parquet as pq
>>> pprint.pprint(json.loads(pq.read_schema("example.parquet").metadata[b"geo"]))
"""
import json
import pathlib
import copy

import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow.csv import write_csv

from shapely import from_wkt, to_wkb


# Directory containing this script; every generated file is written into it.
HERE = pathlib.Path(__file__).parent


# Base "geo" metadata shared by all generated Parquet files. It is deep-copied
# for each geometry type, after which "geometry_types" (and, for the native
# files, "encoding") are overwritten.
metadata_template = {
    "version": "1.1.0",
    "primary_column": "geometry",
    "columns": {
        "geometry": {
            "encoding": "WKB",
            "geometry_types": [],
        },
    },
}


## Various geometry types with WKB and native (GeoArrow-based) encodings

def write_encoding_files(geometries_wkt, geometries_geoarrow, geometry_type):
    """Write the three example files for a single geometry type.

    The following files are created next to this script, where ``<type>`` is
    ``geometry_type`` in lower case:

    * ``data-<type>-wkt.csv`` -- the WKT strings as given.
    * ``data-<type>-encoding_wkb.parquet`` -- the geometries converted to WKB,
      with "geo" metadata declaring the ``"WKB"`` encoding.
    * ``data-<type>-encoding_native.parquet`` -- ``geometries_geoarrow`` stored
      as-is, with "geo" metadata whose encoding is ``<type>``.

    Every file also gets a ``col`` column holding the row index.

    Parameters
    ----------
    geometries_wkt : list of str or None
        One WKT string (or None) per row.
    geometries_geoarrow : pyarrow.Array
        The same geometries as a nested array; expected to have one entry per
        entry of ``geometries_wkt``.
    geometry_type : str
        Geometry type name (e.g. ``"Point"``), recorded in the metadata's
        ``geometry_types`` list and used, lower-cased, in file names.
    """
    type_slug = geometry_type.lower()
    file_stem = f"data-{type_slug}"
    row_ids = range(len(geometries_wkt))

    def _attach_geo_metadata(tbl, geo):
        # Serialize the current state of the metadata dict into the schema.
        return tbl.replace_schema_metadata({"geo": json.dumps(geo)})

    # Plain WKT, written as CSV
    wkt_table = pa.table({"col": row_ids, "geometry": geometries_wkt})
    write_csv(wkt_table, HERE / f"{file_stem}-wkt.csv")

    # WKB encoding
    geo_metadata = copy.deepcopy(metadata_template)
    column_metadata = geo_metadata["columns"]["geometry"]
    column_metadata["geometry_types"] = [geometry_type]
    wkb_table = pa.table(
        {"col": row_ids, "geometry": to_wkb(from_wkt(geometries_wkt))}
    )
    pq.write_table(
        _attach_geo_metadata(wkb_table, geo_metadata),
        HERE / f"{file_stem}-encoding_wkb.parquet",
    )

    # native (geoarrow) encoding: same metadata, only the encoding differs
    column_metadata["encoding"] = type_slug
    native_table = pa.table({"col": row_ids, "geometry": geometries_geoarrow})
    pq.write_table(
        _attach_geo_metadata(native_table, geo_metadata),
        HERE / f"{file_stem}-encoding_native.parquet",
    )


# point

# WKT inputs for the point files: a regular point, an empty point, a missing
# (None) entry and a second regular point.
geometries_wkt = [
    "POINT (30 10)",
    "POINT EMPTY",
    None,
    "POINT (40 40)",
]
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should there also be a version of these that contain NULLs or a version that contains Z values?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added some null values!


# The child fields are kept in a list so the exact same definitions can be
# reused when assembling the struct array below.
point_fields = [
    pa.field("x", pa.float64(), nullable=False),
    pa.field("y", pa.float64(), nullable=False),
]
point_type = pa.struct(point_fields)

nan = float("nan")
# Row 1 (POINT EMPTY) stays a valid struct with NaN coordinates. Row 2 is
# marked null through the mask so that the native file agrees with the WKT
# list, where that row is None rather than an empty point. The NaN
# placeholders under the null slot keep the non-nullable x/y children free of
# nulls.
geometries = pa.StructArray.from_arrays(
    [
        pa.array([30.0, nan, nan, 40.0], type=pa.float64()),
        pa.array([10.0, nan, nan, 40.0], type=pa.float64()),
    ],
    fields=point_fields,
    mask=pa.array([False, False, True, False]),
)

write_encoding_files(geometries_wkt, geometries, geometry_type="Point")

# linestring

geometries_wkt = [
    "LINESTRING (30 10, 10 30, 40 40)",
    "LINESTRING EMPTY",
    None,
]

linestring_type = pa.list_(pa.field("vertices", point_type, nullable=False))
# [] encodes the empty linestring; None is a null row, mirroring the None
# entry in the WKT list so all three output files agree on which row is
# missing.
geometries = pa.array(
    [[(30, 10), (10, 30), (40, 40)], [], None],
    type=linestring_type,
)

write_encoding_files(geometries_wkt, geometries, geometry_type="LineString")

# polygon

geometries_wkt = [
    "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))",
    "POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 30))",
    "POLYGON EMPTY",
    None,
]

polygon_type = pa.list_(
    pa.field(
        "rings",
        pa.list_(pa.field("vertices", point_type, nullable=False)),
        nullable=False,
    )
)
geometries = pa.array(
    [
        [[(30, 10), (40, 40), (20, 40), (10, 20), (30, 10)]],
        [
            [(35, 10), (45, 45), (15, 40), (10, 20), (35, 10)],
            [(20, 30), (35, 35), (30, 20), (20, 30)],
        ],
        # empty polygon
        [],
        # null row, mirroring the None entry in the WKT list (previously
        # written as a second empty polygon)
        None,
    ],
    type=polygon_type,
)

write_encoding_files(geometries_wkt, geometries, geometry_type="Polygon")

# multipoint

geometries_wkt = [
    "MULTIPOINT ((30 10))",
    "MULTIPOINT ((10 40), (40 30), (20 20), (30 10))",
    "MULTIPOINT EMPTY",
    None,
]

multipoint_type = pa.list_(pa.field("points", point_type, nullable=False))
geometries = pa.array(
    [
        [(30, 10)],
        [(10, 40), (40, 30), (20, 20), (30, 10)],
        # empty multipoint
        [],
        # null row, mirroring the None entry in the WKT list (previously
        # written as a second empty multipoint)
        None,
    ],
    type=multipoint_type,
)

write_encoding_files(geometries_wkt, geometries, geometry_type="MultiPoint")

# multilinestring

geometries_wkt = [
    "MULTILINESTRING ((30 10, 10 30, 40 40))",
    "MULTILINESTRING ((10 10, 20 20, 10 40), (40 40, 30 30, 40 20, 30 10))",
    "MULTILINESTRING EMPTY",
    None,
]

multilinestring_type = pa.list_(
    pa.field("linestrings", linestring_type, nullable=False)
)
geometries = pa.array(
    [
        [[(30, 10), (10, 30), (40, 40)]],
        [
            [(10, 10), (20, 20), (10, 40)],
            [(40, 40), (30, 30), (40, 20), (30, 10)],
        ],
        # empty multilinestring
        [],
        # null row, mirroring the None entry in the WKT list (previously
        # written as a second empty multilinestring)
        None,
    ],
    type=multilinestring_type,
)

write_encoding_files(geometries_wkt, geometries, geometry_type="MultiLineString")

# multipolygon

geometries_wkt = [
    "MULTIPOLYGON (((30 10, 40 40, 20 40, 10 20, 30 10)))",
    "MULTIPOLYGON (((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 20, 5 10, 15 5)))",
    "MULTIPOLYGON (((40 40, 20 45, 45 30, 40 40)), ((20 35, 10 30, 10 10, 30 5, 45 20, 20 35), (30 20, 20 15, 20 25, 30 20)))",
    "MULTIPOLYGON EMPTY",
    None,
]

multipolygon_type = pa.list_(pa.field("polygons", polygon_type, nullable=False))
geometries = pa.array(
    [
        [[[(30, 10), (40, 40), (20, 40), (10, 20), (30, 10)]]],
        [
            [[(30, 20), (45, 40), (10, 40), (30, 20)]],
            [[(15, 5), (40, 10), (10, 20), (5, 10), (15, 5)]],
        ],
        [
            [[(40, 40), (20, 45), (45, 30), (40, 40)]],
            [
                [(20, 35), (10, 30), (10, 10), (30, 5), (45, 20), (20, 35)],
                [(30, 20), (20, 15), (20, 25), (30, 20)],
            ],
        ],
        # empty multipolygon
        [],
        # null row, mirroring the None entry in the WKT list (previously
        # written as a second empty multipolygon)
        None,
    ],
    type=multipolygon_type,
)

write_encoding_files(geometries_wkt, geometries, geometry_type="MultiPolygon")