Create Evaluation Recipe with API in Dataiku 9
Dear Dataiku Community,
My organization will still be using Dataiku 9.0.4 for a while before updating to Dataiku 10.
In this version, it seems that I can only create an Evaluation Recipe through the graphical user interface.
Indeed, the API code:
erc = p.new_recipe(type='evaluation', name="evaluate_on_mytest")
erc.with_input("mytest")
erc.with_input_model("mymodel")
erc.with_output("myscores")
erc.with_output_metrics("mymetrics")
er = erc.create()
er.run(wait=True)
should work well in Dataiku 10, but it does not work in Dataiku 9 because there is no such thing as type='evaluation' for the recipe creators, i.e., no EvaluationRecipeCreator exists in https://github.com/dataiku/dataiku-api-client-python/blob/release/9.0/dataikuapi/dss/recipe.py.
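A quick way to confirm this against your own client version, assuming the dataikuapi package matching your instance is importable, is to list the recipe creator classes the module exposes:

from dataikuapi.dss import recipe

# List the recipe creator classes available in this client version;
# on the 9.0 client, EvaluationRecipeCreator does not show up.
print([name for name in dir(recipe) if name.endswith("RecipeCreator")])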
I'm wondering if there is an alternative way to create an Evaluation Recipe in Dataiku 9.0.4 via the API.
Thanks for your attention,
Daniele
Operating system used: Unix
Best Answer
Hi,
Indeed, this will be much simpler in v10+ using the code you pasted.
Below is a workaround that you can try on your v9 instance. Note that the raw_payload at the bottom works for classification models; if you're evaluating a regression model, you'll need to change it (as noted in the comments).
import dataiku


def create_managed_dataset(project_key, dataset_name, connection_name):
    """Creates a managed dataset in a given project, in a given connection"""
    client = dataiku.api_client()
    project = client.get_project(project_key)
    ds_helper = project.new_managed_dataset_creation_helper(dataset_name).with_store_into(connection_name)
    ds_helper.create()


def create_datasets_for_evaluation_recipe(project_key, main_output_dataset, metrics_output_dataset, connection_name):
    """Creates two managed datasets (for an evaluation recipe) in a given project, in a given connection"""
    create_managed_dataset(project_key, main_output_dataset, connection_name)
    create_managed_dataset(project_key, metrics_output_dataset, connection_name)


def create_evaluation_recipe_shell(project_key, input_dataset_name, model_id, main_output_dataset,
                                   metrics_output_dataset, connection_name):
    """Creates a new 'shell' evaluation recipe in a given project. Returns the recipe name."""
    client = dataiku.api_client()
    project = client.get_project(project_key)
    create_datasets_for_evaluation_recipe(project_key, main_output_dataset, metrics_output_dataset, connection_name)
    recipe_name = 'evaluate_on_' + input_dataset_name
    recipe_proto = {
        "type": "evaluation",
        "name": recipe_name,
        "inputs": {
            "main": {"items": [{"deps": [], "ref": input_dataset_name}]},
            "model": {"items": [{"deps": [], "ref": model_id}]}
        },
        "outputs": {
            "main": {"items": [{"appendMode": False, "ref": main_output_dataset}]},
            "metrics": {"items": [{"appendMode": True, "ref": metrics_output_dataset}]}
        }
    }
    creation_settings = {"rawCreation": True}
    new_recipe = project.create_recipe(recipe_proto, creation_settings)
    return new_recipe.recipe_name


def update_evaluation_recipe_shell(project_key, recipe_name, raw_payload):
    """Updates a 'shell' evaluation recipe with a proper raw payload"""
    client = dataiku.api_client()
    project = client.get_project(project_key)
    recipe = project.get_recipe(recipe_name)
    recipe_settings = recipe.get_settings()
    recipe_settings.set_payload(raw_payload)
    recipe_settings.save()


def create_new_evaluation_recipe(project_key, input_dataset_name, model_id, main_output_dataset,
                                 metrics_output_dataset, connection_name, raw_payload):
    """Creates a new evaluation recipe"""
    new_recipe_name = create_evaluation_recipe_shell(project_key, input_dataset_name, model_id,
                                                     main_output_dataset, metrics_output_dataset, connection_name)
    update_evaluation_recipe_shell(project_key, new_recipe_name, raw_payload)


project_key = "PROJECT_KEY"
input_dataset_name = "INPUT_DATASET_NAME"
model_id = "MODEL_ID"
main_output_dataset = "NEW_MAIN_OUTPUT_DATASET_NAME"
metrics_output_dataset = "NEW_METRICS_OUTPUT_DATASET_NAME"
connection_name = "CONNECTION_NAME"  # e.g. "filesystem_managed"

# This raw payload works for classification models.
# To get the payload for regression models, you'll need to get the settings of another
# evaluate recipe (project.get_recipe(...).get_settings().get_payload()) and copy it over.
# You may want to change some of these values if you'd like to see different metrics
# in your output dataset, or enable GPUs for scoring, for example.
raw_payload = """{
  "outputs": ["prediction_correct"],
  "perGPUMemoryFraction": 0.5,
  "sqlPipelineParams": {"pipelineAllowMerge": true, "pipelineAllowStart": true},
  "backendType": "PY_MEMORY",
  "filterInputColumns": false,
  "useGPU": false,
  "keptInputColumns": [],
  "pythonBatchSize": 100000,
  "gpuAllowGrowth": false,
  "outputProbaPercentiles": false,
  "outputProbabilities": true,
  "forceOriginalEngine": false,
  "metrics": ["precision", "recall", "auc", "f1", "accuracy", "mcc", "hammingLoss",
              "logLoss", "lift", "calibrationLoss", "customScore"],
  "individualExplanationParams": {"subChunkSize": 5000, "method": "ICE",
                                  "shapleyBackgroundSize": 100, "nbExplanations": 3},
  "forcedClassifierThreshold": 0,
  "batchSize": 100,
  "outputExplanations": false,
  "overrideModelSpecifiedThreshold": false,
  "sparkParams": {
    "pipelineAllowMerge": true,
    "sparkPreparedDFStorageLevel": "MEMORY_AND_DISK",
    "pipelineAllowStart": true,
    "sparkExecutionEngine": "SPARK_SUBMIT",
    "sparkConf": {"inheritConf": "default", "conf": []},
    "sparkRepartitionNonHDFS": 1,
    "sparkUseGlobalMetastore": false
  },
  "gpuList": [0]
}"""

create_new_evaluation_recipe(project_key, input_dataset_name, model_id, main_output_dataset,
                             metrics_output_dataset, connection_name, raw_payload)
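If you need the regression payload, here is a minimal sketch of the lookup mentioned in the comments above; the recipe name is a hypothetical placeholder for an evaluation recipe you created once through the GUI on a regression model:

import dataiku

client = dataiku.api_client()
project = client.get_project("PROJECT_KEY")

# "evaluate_on_regression_test" is a placeholder: use the name of an existing
# GUI-created evaluation recipe on a regression model.
regression_payload = project.get_recipe("evaluate_on_regression_test").get_settings().get_payload()
print(regression_payload)

Also note that the script above only creates the recipe; it doesn't run it. One way to mirror the er.run(wait=True) of the v10 snippet is to build the recipe's main output dataset. This sketch assumes your client version exposes DSSDataset.build(); if it doesn't, you can build the dataset from the Flow or via a scenario:

import dataiku

client = dataiku.api_client()
project = client.get_project("PROJECT_KEY")

# Building the main output dataset triggers the evaluation recipe that produces it.
project.get_dataset("NEW_MAIN_OUTPUT_DATASET_NAME").build(wait=True)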
Best,
Pat