VAT 4 Machine Learning - Creating Training data for a species distribution model

++ Currently, the examples are being reworked after the latest update because GBIF behaves differently now. Find out more. ++

This workflow is a contribution to the NFDI4Earth conference.

This workflow is a contribution to the NFDI4Earth conference. This workflow therefore uses the frequency of Arnica montana occurrences from GBIF as a target variable together with weather data from CHELSA, land use classification from the Ökosystematlas and topographic information as predictor variables to create a species distribution model for Arnica montana across Germany.

Import

#Import Packages
import geoengine as ge
from datetime import datetime 
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import xarray as xr
import numpy as np
import asyncio
import nest_asyncio
#Initialize Geo Engine in VAT
ge.initialize("https://vat.gfbio.org/api")
#Get the GBIF DataProvider id (useful for translating the DataProvider name to its id)
root_collection = ge.layer_collection()
gbif_prov_id = ''
for elem in root_collection.items:
    if elem.name == 'GBIF':
        gbif_prov_id = str(elem.provider_id)
        
gbif_prov_id
'1c01dbb9-e3ab-f9a2-06f5-228ba4b6bf7a'

Create Labelled Data

This chapter shows how to register the workflow for generating training data and how to manipulate this data to generate training data.

#Tuning parameters
start_time = datetime.strptime('2001-01-01T12:00:00.000Z', "%Y-%m-%dT%H:%M:%S.%f%z")
end_time = datetime.strptime('2011-01-01T12:00:00.000Z', "%Y-%m-%dT%H:%M:%S.%f%z")
resolution = ge.SpatialResolution(0.01, 0.01)
extent = ge.BoundingBox2D(5.852490, 47.271121, 15.022059, 55.065334)

#Species selection
species = "species/Arnica montana" #Arnica
#Create a workflow to retrieve Arnica montana occurrences filtered by the German border and linked to weather, land use and topographic data.
workflow = ge.register_workflow({
    "type": "Vector",
    "operator": {
        "type": "RasterVectorJoin",
        "params": {
            "names": {
                "type": "names",
                "values": ["Ökosystematlas", "SRTM", "Mean Air Temperature", "Mean Climate Moisture Index", "Precipitation"]
            },  
            "temporalAggregation": "none",
            "featureAggregation": "first",
        },
        "sources": {
            "vector": { #Arnica montana #########################################
                "type": "PointInPolygonFilter", 
                "params": {},
                "sources": {
                    "points": {
                        "type": "OgrSource",
                        "params": {
                            "data": f"_:{gbif_prov_id}:`{species}`",
                            "attributeProjection": []
                        }
                    },
                    "polygons": {
                        "type": "OgrSource",
                        "params": {
                            "data": "germany"
                        }
                    }
                }
            }, 
            "rasters": [{ #Ökosystematlas ########################################
                "type": "RasterTypeConversion",
                "params": {
                    "outputDataType": "F32"
                },
                "sources": {
                    "raster": {
                        "type": "GdalSource",
                        "params": {
                            "data": "oekosystematlas"
                        },
                    }
                }
            },
            { #SRTM #########################################################
                "type": "RasterTypeConversion",
                "params": {
                    "outputDataType": "F32"
                },
                "sources": {
                    "raster": {
                        "type": "GdalSource",
                        "params": {
                            "data": "srtm"
                        },
                    }
                }
                
            },
            { #Mean Annual Air Temperature ##################################
                "type": "TemporalRasterAggregation",
                "params": {
                    "aggregation": {
                        "type": "mean",
                        "ignoreNoData": False
                    },
                    "window": {
                        "granularity": "years",
                        "step": 1
                    },
                    "windowReference": None,
                    "outputType": None,
                },
                "sources": {
                    "raster": {
                        "type": "RasterScaling",
                        "params": {
                            "slope": {
                                "type": "constant",
                                "value": 0.1
                            },
                            "offset": {
                                "type": "constant",
                                "value": -273.15
                            },
                            "outputMeasurement": {
                                "type": "continuous",
                                "measurement": "temperature",
                                "unit": "K/10"
                            },
                            "scalingMode": "mulSlopeAddOffset"
                        },
                        "sources": {
                            "raster": {
                                "type": "RasterTypeConversion",
                                "params": {
                                    "outputDataType": "F32"
                                },
                                "sources": {
                                    "raster": {
                                        "type": "GdalSource",
                                        "params": {
                                            "data": "mean_daily_air_temperature"
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            },
            { #Mean Annual Climate moisture indices #########################
                "type": "TemporalRasterAggregation",
                "params": {
                    "aggregation": {
                        "type": "mean",
                        "ignoreNoData": False
                    },
                    "window": {
                        "granularity": "years",
                        "step": 1
                    },
                    "windowReference": None,
                    "outputType": None,
                },
                "sources": {
                    "raster": {
                        "type": "RasterScaling",
                        "params": {
                            "slope": {
                                "type": "constant",
                                "value": 0.1
                            },
                            "offset": {
                                "type": "constant",
                                "value": 0
                            },
                            "outputMeasurement": {
                                "type": "continuous",
                                "measurement": "climate moisture",
                                "unit": "kg m^-2 month^-1"
                            },
                            "scalingMode": "mulSlopeAddOffset"
                        },
                        "sources": {
                            "raster": {
                                "type": "RasterTypeConversion",
                                "params": {
                                    "outputDataType": "F32"
                                },
                                "sources": {
                                    "raster": {
                                        "type": "GdalSource",
                                        "params": {
                                            "data": "monthly_climate_moisture_indicies"
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            },
            { #Sum Annual Precipitation ####################################
                "type": "TemporalRasterAggregation",
                "params": {
                    "aggregation": {
                        "type": "sum",
                        "ignoreNoData": False
                    },
                    "window": {
                        "granularity": "years",
                        "step": 1
                    },
                    "windowReference": None,
                    "outputType": None,
                },
                "sources": {
                    "raster": {
                        "type": "RasterScaling",
                        "params": {
                            "slope": {
                                "type": "constant",
                                "value": 0.1
                            },
                            "offset": {
                                "type": "constant",
                                "value": 0
                            },
                            "outputMeasurement": {
                                "type": "continuous",
                                "measurement": "precipitation",
                                "unit": "kg m-2 month^-1"
                            },
                            "scalingMode": "mulSlopeAddOffset"
                        },
                        "sources": {
                            "raster": {
                                "type": "RasterTypeConversion",
                                "params": {
                                    "outputDataType": "F32"
                                },
                                "sources": {
                                    "raster": {
                                        "type": "GdalSource",
                                        "params": {
                                            "data": "monthly_precipitation_amount"
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }]
        },
    }
})
    
workflow
7582cfcb-3d36-5b86-bb72-e81cef584fae
#Request the data from Geo Engine into a geopandas dataframe
data = workflow.get_dataframe(
    ge.QueryRectangle(
        extent,
        ge.TimeInterval(start_time, end_time),
        resolution=resolution,
        srs="EPSG:4326"
    )
)

#Plot the data
data.plot()
<Axes: >

output_11_1.png

data
geometry Mean Air Temperature Mean Climate Moisture Index Precipitation SRTM basisofrecord gbifid scientificname Ökosystematlas start end
0 POINT (11.29000 50.47000) 6.600011 42.033333 1186.700073 684.0 HUMAN_OBSERVATION 1922039098 Arnica montana L. 12.0 2001-09-24 00:00:00+00:00 2001-09-24 00:00:00+00:00
1 POINT (10.04000 47.52000) 6.658340 108.008339 2084.500244 845.0 HUMAN_OBSERVATION 1922860404 Arnica montana L. 12.0 2001-08-21 00:00:00+00:00 2001-08-21 00:00:00+00:00
2 POINT (11.29000 50.42000) 7.016680 41.541668 1193.100098 638.0 HUMAN_OBSERVATION 1922902358 Arnica montana L. 6.0 2001-10-01 00:00:00+00:00 2001-10-01 00:00:00+00:00
3 POINT (10.04000 47.52000) 6.658340 108.008339 2084.500244 845.0 HUMAN_OBSERVATION 1922858802 Arnica montana L. 12.0 2001-07-11 00:00:00+00:00 2001-07-11 00:00:00+00:00
4 POINT (10.21000 47.37000) 3.758347 102.083336 1912.100098 1649.0 HUMAN_OBSERVATION 1926238160 Arnica montana L. 255.0 2001-07-04 00:00:00+00:00 2001-07-04 00:00:00+00:00
... ... ... ... ... ... ... ... ... ... ... ...
1551 POINT (11.18340 47.59551) 8.066678 41.883335 1305.400024 638.0 HUMAN_OBSERVATION 920659766 Arnica montana L. 2.0 2010-06-10 00:00:00+00:00 2010-06-10 00:00:00+00:00
1552 POINT (12.05000 50.11000) 7.183350 5.225000 784.700012 594.0 HUMAN_OBSERVATION 1806720955 Arnica montana L. 11.0 2010-06-22 00:00:00+00:00 2010-06-22 00:00:00+00:00
1553 POINT (13.04000 48.87000) 7.075012 46.866669 1302.099976 706.0 HUMAN_OBSERVATION 1927043392 Arnica montana L. 8.0 2010-06-16 00:00:00+00:00 2010-06-16 00:00:00+00:00
1554 POINT (12.04000 50.12000) 7.425008 0.308333 768.400024 557.0 HUMAN_OBSERVATION 1806720970 Arnica montana L. 6.0 2010-06-22 00:00:00+00:00 2010-06-22 00:00:00+00:00
1555 POINT (11.64000 50.09000) NaN NaN NaN 536.0 HUMAN_OBSERVATION 1946786537 Arnica montana L. 6.0 2011-01-01 00:00:00+00:00 2011-01-01 00:00:00+00:00

1556 rows × 11 columns

#Rounding and grouping of occurrences to create frequency along with predictor variable combination
training_data = data.round(3)
training_data = training_data.groupby(['Mean Air Temperature', 'Mean Climate Moisture Index', 'Precipitation', 'SRTM', 'Ökosystematlas']).size().reset_index(name='counts')
training_data
Mean Air Temperature Mean Climate Moisture Index Precipitation SRTM Ökosystematlas counts
0 -0.842 126.342 2321.8 2036.0 14.0 13
1 -0.717 178.900 2899.2 1938.0 16.0 3
2 0.275 153.200 2687.8 1811.0 255.0 20
3 0.858 123.850 2270.8 1798.0 14.0 1
4 0.900 109.475 2042.6 1822.0 11.0 1
... ... ... ... ... ... ...
347 9.500 -14.567 631.3 216.0 6.0 1
348 9.692 14.392 971.4 292.0 12.0 1
349 9.692 15.342 943.5 327.0 10.0 1
350 10.317 6.358 775.4 120.0 10.0 1
351 10.667 0.325 756.8 99.0 2.0 1

352 rows × 6 columns

training_data.sort_values('counts', ascending=False)
Mean Air Temperature Mean Climate Moisture Index Precipitation SRTM Ökosystematlas counts
52 5.850 128.183 2370.3 1072.0 8.0 54
24 3.858 121.725 2325.1 1071.0 14.0 43
147 7.042 33.925 1123.1 565.0 13.0 43
28 4.258 102.975 2073.7 1435.0 11.0 38
25 3.875 112.683 2174.2 1414.0 12.0 36
... ... ... ... ... ... ...
116 6.808 72.058 1677.0 875.0 8.0 1
114 6.808 18.958 948.8 668.0 8.0 1
113 6.800 68.783 1684.7 1018.0 8.0 1
232 7.617 39.742 1317.6 729.0 15.0 1
351 10.667 0.325 756.8 99.0 2.0 1

352 rows × 6 columns

Create Prediction Data

This chapter shows how to register the workflow to create prediction data.

#Create a workflow to request weather, land use and topographic data as a raster stack.
prediction_workflow = ge.register_workflow({
    "type": "Raster",
    "operator": {
          "type": "RasterStacker",
          "params": {
            "renameBands": {
              "type": "rename",
              "values": ["Ökosystematlas", "SRTM", "Mean Air Temperature", "Mean Climate Moisture Index", "Precipitation"]
            }
          },
          "sources": {
            "rasters": [{ #Ökosystematlas ########################################
                "type": "RasterTypeConversion",
                "params": {
                    "outputDataType": "F32"
                },
                "sources": {
                    "raster": {
                        "type": "GdalSource",
                        "params": {
                            "data": "oekosystematlas"
                        },
                    }
                }
            },
            { #SRTM #########################################################
                "type": "RasterTypeConversion",
                "params": {
                    "outputDataType": "F32"
                },
                "sources": {
                    "raster": {
                        "type": "GdalSource",
                        "params": {
                            "data": "srtm"
                        },
                    }
                }
                
            },
            { #Mean Annual Air Temperature ##################################
                "type": "TemporalRasterAggregation",
                "params": {
                    "aggregation": {
                        "type": "mean",
                        "ignoreNoData": False
                    },
                    "window": {
                        "granularity": "years",
                        "step": 1
                    },
                    "windowReference": None,
                    "outputType": None,
                },
                "sources": {
                    "raster": {
                        "type": "RasterScaling",
                        "params": {
                            "slope": {
                                "type": "constant",
                                "value": 0.1
                            },
                            "offset": {
                                "type": "constant",
                                "value": -273.15
                            },
                            "outputMeasurement": {
                                "type": "continuous",
                                "measurement": "temperature",
                                "unit": "K/10"
                            },
                            "scalingMode": "mulSlopeAddOffset"
                        },
                        "sources": {
                            "raster": {
                                "type": "RasterTypeConversion",
                                "params": {
                                    "outputDataType": "F32"
                                },
                                "sources": {
                                    "raster": {
                                        "type": "GdalSource",
                                        "params": {
                                            "data": "mean_daily_air_temperature"
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            },
            { #Mean Annual Climate moisture indices #########################
                "type": "TemporalRasterAggregation",
                "params": {
                    "aggregation": {
                        "type": "mean",
                        "ignoreNoData": False
                    },
                    "window": {
                        "granularity": "years",
                        "step": 1
                    },
                    "windowReference": None,
                    "outputType": None,
                },
                "sources": {
                    "raster": {
                        "type": "RasterScaling",
                        "params": {
                            "slope": {
                                "type": "constant",
                                "value": 0.1
                            },
                            "offset": {
                                "type": "constant",
                                "value": 0
                            },
                            "outputMeasurement": {
                                "type": "continuous",
                                "measurement": "climate moisture",
                                "unit": "kg m^-2 month^-1"
                            },
                            "scalingMode": "mulSlopeAddOffset"
                        },
                        "sources": {
                            "raster": {
                                "type": "RasterTypeConversion",
                                "params": {
                                    "outputDataType": "F32"
                                },
                                "sources": {
                                    "raster": {
                                        "type": "GdalSource",
                                        "params": {
                                            "data": "monthly_climate_moisture_indicies"
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            },
            { #Sum Annual Precipitation ####################################
                "type": "TemporalRasterAggregation",
                "params": {
                    "aggregation": {
                        "type": "sum",
                        "ignoreNoData": False
                    },
                    "window": {
                        "granularity": "years",
                        "step": 1
                    },
                    "windowReference": None,
                    "outputType": None,
                },
                "sources": {
                    "raster": {
                        "type": "RasterScaling",
                        "params": {
                            "slope": {
                                "type": "constant",
                                "value": 0.1
                            },
                            "offset": {
                                "type": "constant",
                                "value": 0
                            },
                            "outputMeasurement": {
                                "type": "continuous",
                                "measurement": "precipitation",
                                "unit": "kg m-2 month^-1"
                            },
                            "scalingMode": "mulSlopeAddOffset"
                        },
                        "sources": {
                            "raster": {
                                "type": "RasterTypeConversion",
                                "params": {
                                    "outputDataType": "F32"
                                },
                                "sources": {
                                    "raster": {
                                        "type": "GdalSource",
                                        "params": {
                                            "data": "monthly_precipitation_amount"
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }]
          }
        }
    
})

prediction_workflow
370296a3-db66-599b-8e55-2a4bf362a09a
#Preparing of the boundaries for the workflow raster stream
bbox = ge.QueryRectangle(
    extent,
    ge.TimeInterval(start_time, start_time),
    resolution=resolution,
    srs="EPSG:4326"
)
nest_asyncio.apply()

async def get_prediction_data(workflow, bbox, bands=[0,1,2,3,4], clip=True):
    data = await workflow.raster_stream_into_xarray(bbox, bands=bands, clip_to_query_rectangle=clip)
    data.to_dataset(name="prediction")
    return data

async def main(extent, time, resolution, workflow):
    bbox = ge.QueryRectangle(extent, ge.TimeInterval(time, time), resolution=resolution, srs="EPSG:4326")
    return await get_prediction_data(workflow, bbox)

try:
    loop = asyncio.get_event_loop()
except RuntimeError:
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

prediction_data = loop.run_until_complete(main(extent, start_time, resolution, prediction_workflow))
prediction_data.to_dataset(name="prediction")
/home/duempelmann/geoengine_env/lib/python3.10/site-packages/rasterio/windows.py:314: RasterioDeprecationWarning: The height, width, and precision parameters are unused, deprecated, and will be removed in 2.0.0.
  warnings.warn(
<xarray.Dataset> Size: 14MB
Dimensions:      (x: 918, y: 780, time: 1, band: 5)
Coordinates:

  • x (x) float64 7kB 5.855 5.865 5.875 5.885 ... 15.0 15.01 15.02

  • y (y) float64 6kB 55.07 55.06 55.05 55.04 ... 47.3 47.29 47.28

  • time (time) datetime64[ns] 8B 2001-01-01

  • band (band) int64 40B 0 1 2 3 4 spatial_ref int64 8B 0 Data variables: prediction (time, band, y, x) float32 14MB 21.0 21.0 ... 1.082e+03