iiasa · khaeru · Mar 9, 2026 · Feb 10, 2026 · Aug 27, 2025 · Jan 20, 2026
@@ -15,9 +15,12 @@ This work extends the water sector linkage described by Parkinson et al. (2019)
 CLI usage
 =========
 
-Use the :doc:`CLI </cli>` command ``mix-data water`` to invoke the commands defined in :mod:`.water.cli`. Example:
-``mix-models --url=ixmp://ixmp_dev/ENGAGE_SSP2_v4.1.7/baseline_clone_test water cooling``
-model and scenario specifications can be either set manually in ``cli.py`` or specified in the ``--url`` option
+Use the :doc:`CLI </cli>` command ``mix-models water-ix`` to invoke the commands defined in :mod:`.water.cli`.
+Model and scenario specifications can be set via the ``--url`` option or in ``cli.py``.
+
+Example::
+
+   mix-models --url=ixmp://ixmp_dev/ENGAGE_SSP2_v4.1.7/baseline_clone_test water-ix nexus
 
 .. code::
 
@@ -37,18 +40,37 @@ model and scenario specifications can be either set manually in ``cli.py`` or sp
    nexus    Add basin structure connected to the energy sector and water...
    report   function to run the water report_full from cli to the scenario...
 
+.. code::
+
+   Usage: mix-models water-ix nexus [OPTIONS]
+
+   Options:
+   --rcps [no_climate|6p0|2p6|7p0]   Climate scenario (default: no_climate).
+   --rels [low|med|high]              Hydrological data reliability (default: low).
+   --sdgs TEXT                        Water SDG measures (default: baseline).
+   --macro                            Solve with MESSAGE-MACRO.
+   --reduced-basin / --no-reduced-basin
+                                      Enable basin filtering (default: off).
+   --basin-selection [first_k|stress] Automatic selection method (default: first_k).
+                                      first_k: first N basins per region in CSV order.
+                                      stress: sample basins across the demand/supply ratio spectrum.
+   --num-basins INTEGER               Basins per region (default: 3).
+   --filter-list TEXT                 Extra basins to add to the automatic selection
+                                      (repeatable). Final set is the union of automatic
+                                      selection and filter-list entries.
+
 Country vs Global implementation
 --------------------------------
 
 The :mod:`message_ix_models.model.water` is designed to add water components to either a global R11 (or R12) model or any country model designed with `the MESSAGEix single country <https://github.com/iiasa/message_single_country>`_ model prototype.
-For any of the region configuration a shapefile is needed to run the pre-processing part, while, once the data is prepared, only a .csv file similar to those in `message_ix_models.data.water.delineation` is needed.
+For any region configuration, a shapefile is needed to run the pre-processing part; once the data is prepared, only a .csv file similar to those in :file:`message_ix_models/data/water/delineation/` is needed.
 
 To work with a country model please ensure that:
 
 1. country model and scenario are specified either in ``--url`` or in the ``cli.py`` script
 2. the option ``--regions`` is used with the ISO3 code of the country (e.g. for Israel ``--regions=ISR``)
-3. Following the Israel example add a 'country'.yaml file in `message_ix_models.data.node` for the specific country
-4. Following the Israel example add the country ISO3 code in the 'regions' options in `message_ix_models.utils.click`
+3. Following the Israel example, add a 'country'.yaml file in :file:`message_ix_models/data/node/` for the specific country
+4. Following the Israel example, add the country ISO3 code to the 'regions' option in :mod:`message_ix_models.util.click`
 
 Annual vs sub-annual implementation
 -----------------------------------

diff --git a/doc/whatsnew.rst b/doc/whatsnew.rst
@@ -8,6 +8,11 @@ Next release
   released 2026-01-21 (:pull:`470`).
 - New module :mod:`tools.bilateralize <message_ix_models.tools.bilateralize>`
   to change scenarios to a bilateral representation of trade (:pull:`438`).
+
+- Add reduced basin filtering for water module with ``--reduced-basin`` and
+  demand/supply stress-based selection via ``--basin-selection stress``
+  (:pull:`432`, :issue:`414`).
+
 - Fix water module parameter bugs and refactor cooling (:pull:`405`):
   infrastructure M1/Mf mode fixes, regional average shares for cooling allocation,
   water supply level hierarchy corrections, and test suite improvements.

@@ -10,7 +10,7 @@
 from message_ix_models.model.structure import get_codes
 from message_ix_models.util import broadcast, package_data_path
 
-from .utils import read_config
+from .utils import filter_basins_by_region, read_config
 
 log = logging.getLogger(__name__)
 
@@ -556,6 +556,10 @@ def map_basin(context: Context) -> Mapping[str, ScenarioInfo]:
     PATH = package_data_path("water", "delineation", FILE)
 
     df = pd.read_csv(PATH)
+
+    # Apply basin filter to reduce number of basins per region
+    df = filter_basins_by_region(df, context)
+
     # Assigning proper nomenclature
     df["node"] = "B" + df["BCU_name"].astype(str)
     df["mode"] = "M" + df["BCU_name"].astype(str)
@@ -577,6 +581,8 @@ def map_basin(context: Context) -> Mapping[str, ScenarioInfo]:
     results["map_node"] = nodes
 
     context.all_nodes = df["node"]
+    # Store the filtered basin names for use in other functions
+    context.valid_basins = set(df["BCU_name"].astype(str))
 
     for set_name, config in results.items():
         # Sets to add

@@ -123,13 +123,52 @@ def water_ini(context: "Context", regions, time):
     is_flag=True,
     help="Defines whether the model solves with macro",
 )
+@click.option(
+    "--reduced-basin/--no-reduced-basin",
+    default=False,
+    help="Enable reduced basin filtering",
+)
+@click.option(
+    "--filter-list",
+    multiple=True,
+    help="Specific basins to include (can be used multiple times)",
+)
+@click.option(
+    "--num-basins",
+    type=int,
+    help="Number of basins per region to keep when reduced-basin is enabled",
+)
+@click.option(
+    "--basin-selection",
+    type=click.Choice(["first_k", "stress"]),
+    default="first_k",
+    help="Basin selection: first_k (CSV order) or stress (demand/supply span)",
+)
 @common_params("regions")
 @scenario_param("--ssp")
-def nexus_cli(context: "Context", regions, rcps, sdgs, rels, macro=False):
+def nexus_cli(
+    context: "Context",
+    regions,
+    rcps,
+    sdgs,
+    rels,
+    macro=False,
+    reduced_basin=False,
+    filter_list=None,
+    num_basins=None,
+    basin_selection="first_k",
+):
     """
     Add basin structure connected to the energy sector and
     water balance linking different water demands to supply.
     """
+    # Set basin filtering configuration on context
+    context.reduced_basin = reduced_basin
+    if filter_list:
+        context.filter_list = list(filter_list)
+    if num_basins is not None:
+        context.num_basins = num_basins
+    context.basin_selection = basin_selection
 
     nexus(context, regions, rcps, sdgs, rels, macro)
 

@@ -201,6 +201,8 @@ def add_sectoral_demands(context: "Context") -> dict[str, pd.DataFrame]:
     for key, df in d.items():
         df.rename(columns={"Unnamed: 0": "year"}, inplace=True)
         df.set_index("year", inplace=True)
+        # Cast column index from StringDtype to object for xarray compatibility
+        df.columns = df.columns.astype(object)
         dfs[key] = df
 
     # convert the dictionary of dataframes to xarray
@@ -217,6 +219,9 @@ def add_sectoral_demands(context: "Context") -> dict[str, pd.DataFrame]:
 
     df_dmds["time"] = "year"
 
+    # Filter to only include basins that exist after basin filtering
+    df_dmds = df_dmds[df_dmds["node"].isin(context.valid_basins)]
+
     # Write final interpolated values as csv
     # df2_f.to_csv('final_interpolated_values.csv')
 
@@ -239,6 +244,9 @@ def add_sectoral_demands(context: "Context") -> dict[str, pd.DataFrame]:
         df_m = df_m[["year", "pid", "variable", "value", "month"]]
         df_m.columns = pd.Index(["year", "node", "variable", "value", "time"])
 
+        # Filter monthly data to only include valid basins
+        df_m = df_m[df_m["node"].isin(context.valid_basins)]
+
         # remove yearly parts from df_dms
         df_dmds = df_dmds[
             ~df_dmds["variable"].isin(
@@ -768,12 +776,10 @@ def read_water_availability(context: "Context") -> Sequence[pd.DataFrame]:
     )
     df_x = pd.read_csv(PATH)
 
+    # Filter to only include valid basins
+    df_x = df_x[df_x["BCU_name"].isin(context.valid_basins)]
+
     if "year" in context.time:
-        # path for reading basin delineation file
-        PATH = package_data_path(
-            "water", "delineation", f"basins_by_region_simpl_{context.regions}.csv"
-        )
-        df_x = pd.read_csv(PATH)
         # Adding freshwater supply constraints
         # Reading data, the data is spatially and temporally aggregated from GHMs
         path1 = package_data_path(
@@ -785,6 +791,14 @@ def read_water_availability(context: "Context") -> Sequence[pd.DataFrame]:
         df_sw = pd.read_csv(path1)
         df_sw.drop(["Unnamed: 0"], axis=1, inplace=True)
 
+        # Filter rows to valid basins using index positions from full list
+        full_basin_df = pd.read_csv(PATH)
+        valid_indices = full_basin_df[
+            full_basin_df["BCU_name"].isin(context.valid_basins)
+        ].index
+        df_sw = df_sw.iloc[valid_indices]  # Keep only rows for valid basins
+        df_sw.reset_index(drop=True, inplace=True)
+
         df_sw.index = df_x["BCU_name"].index
         df_sw = df_sw.stack().reset_index()
         df_sw.columns = pd.Index(["Region", "years", "value"])
@@ -809,6 +823,11 @@ def read_water_availability(context: "Context") -> Sequence[pd.DataFrame]:
         # Read groundwater data
         df_gw = pd.read_csv(path1)
         df_gw.drop(["Unnamed: 0"], axis=1, inplace=True)
+
+        # Filter to only include valid basins (same as df_sw)
+        df_gw = df_gw.iloc[valid_indices]  # Use same valid_indices from above
+        df_gw.reset_index(drop=True, inplace=True)
+
         df_gw.index = df_x["BCU_name"].index
         df_gw = df_gw.stack().reset_index()
         df_gw.columns = pd.Index(["Region", "years", "value"])
@@ -833,6 +852,14 @@ def read_water_availability(context: "Context") -> Sequence[pd.DataFrame]:
         df_sw = pd.read_csv(path1)
         df_sw.drop(["Unnamed: 0"], axis=1, inplace=True)
 
+        # Filter rows to valid basins
+        full_basin_df = pd.read_csv(PATH)
+        valid_indices = full_basin_df[
+            full_basin_df["BCU_name"].isin(context.valid_basins)
+        ].index
+        df_sw = df_sw.iloc[valid_indices]
+        df_sw.reset_index(drop=True, inplace=True)
+
         df_sw.index = df_x["BCU_name"].index
         df_sw = df_sw.stack().reset_index()
         df_sw.columns = pd.Index(["Region", "years", "value"])
@@ -856,6 +883,10 @@ def read_water_availability(context: "Context") -> Sequence[pd.DataFrame]:
         df_gw = pd.read_csv(path1)
         df_gw.drop(["Unnamed: 0"], axis=1, inplace=True)
 
+        # Filter to only include valid basins (same as df_sw)
+        df_gw = df_gw.iloc[valid_indices]  # Use same valid_indices from above
+        df_gw.reset_index(drop=True, inplace=True)
+
         df_gw.index = df_x["BCU_name"].index
         df_gw = df_gw.stack().reset_index()
         df_gw.columns = pd.Index(["Region", "years", "value"])

@@ -291,6 +291,10 @@ def add_infrastructure_techs(context: "Context") -> dict[str, pd.DataFrame]:
     PATH = package_data_path("water", "delineation", FILE2)
 
     df_node = pd.read_csv(PATH)
+
+    # Filter to only valid basins (already filtered in map_basin)
+    df_node = df_node[df_node["BCU_name"].isin(context.valid_basins)]
+
     # Assigning proper nomenclature
     df_node["node"] = "B" + df_node["BCU_name"].astype(str)
     df_node["mode"] = "M" + df_node["BCU_name"].astype(str)
@@ -807,6 +811,10 @@ def add_desalination(context: "Context") -> dict[str, pd.DataFrame]:
     PATH = package_data_path("water", "delineation", FILE2)
 
     df_node = pd.read_csv(PATH)
+
+    # Filter to only valid basins (already filtered in map_basin)
+    df_node = df_node[df_node["BCU_name"].isin(context.valid_basins)]
+
     # Assigning proper nomenclature
     df_node["node"] = "B" + df_node["BCU_name"].astype(str)
     df_node["mode"] = "M" + df_node["BCU_name"].astype(str)
@@ -815,6 +823,11 @@ def add_desalination(context: "Context") -> dict[str, pd.DataFrame]:
         if context.type_reg == "country"
         else f"{context.regions}_" + df_node["REGION"].astype(str)
     )
+
+    # Filter to basins that exist after filtering
+    df_hist = df_hist[df_hist["BCU_name"].isin(context.valid_basins)]
+    df_proj = df_proj[df_proj["BCU_name"].isin(context.valid_basins)]
+
     # output dataframe linking to desal tech types
     out_df = (
         make_df(

@@ -32,6 +32,10 @@ def add_irr_structure(context: "Context") -> dict[str, pd.DataFrame]:
     FILE2 = f"basins_by_region_simpl_{context.regions}.csv"
     PATH = package_data_path("water", "delineation", FILE2)
     df_node = pd.read_csv(PATH)
+
+    # Filter to only include valid basins
+    df_node = df_node[df_node["BCU_name"].isin(context.valid_basins)]
+
     # Assigning proper nomenclature
     df_node["node"] = "B" + df_node["BCU_name"].astype(str)
     df_node["mode"] = "M" + df_node["BCU_name"].astype(str)

@@ -54,7 +54,7 @@ def _load_scenario_and_cooling_data(
         "water", "delineation", f"basins_by_region_simpl_{context.regions}.csv"
     )
 
-    # Load basin delineation
+    # FIXME Derive node_region from scenario/codelist rather than basin CSV
     df_node = pd.read_csv(basin_path)
     df_node["node"] = "B" + df_node["BCU_name"].astype(str)
     df_node["mode"] = "M" + df_node["BCU_name"].astype(str)
@@ -121,9 +121,11 @@ def _compute_cooling_rates(input_cool: pd.DataFrame) -> pd.DataFrame:
 
     # Cooling fraction: heat to be rejected
     input_cool["cooling_fraction"] = input_cool.apply(
-        lambda r: r["value"] - 1
-        if "hpl" in str(r.get("parent_tech", ""))
-        else r["value"] * (1 - flue_loss) - 1,
+        lambda r: (
+            r["value"] - 1
+            if "hpl" in str(r.get("parent_tech", ""))
+            else r["value"] * (1 - flue_loss) - 1
+        ),
         axis=1,
     )