Create Evaluation Recipe with API in Dataiku 9

Solved!
dongari
Level 2

Dear Dataiku Community, 

My organization will still be using Dataiku 9.0.4 for a while before updating to Dataiku 10.

In this version, it seems I can only create an Evaluation Recipe through the graphical user interface.
Indeed, the API code:

erc = p.new_recipe(type='evaluation', name="evaluate_on_mytest")
erc.with_input("mytest")
erc.with_input_model("mymodel")
erc.with_output("myscores")
erc.with_output_metrics("mymetrics")
er = erc.create()
er.run(wait=True)

should work in Dataiku 10, but it does not work in Dataiku 9 because there is no type='evaluation' for the RecipeCreator, i.e., no EvaluationRecipeCreator in https://github.com/dataiku/dataiku-api-client-python/blob/release/9.0/dataikuapi/dss/recipe.py.

I'm wondering if there is an alternative way to create an Evaluation Recipe in Dataiku 9.0.4 via the API.

Thanks for your attention,

Daniele


Operating system used: Unix

1 Solution
pmasiphelps
Dataiker

Hi,

 

Indeed, this will be much simpler in v10+ using the code you pasted 🙂

Below is a workaround that you can try on your v9 instance. Note that the raw_payload at the bottom works for classification models; if you're evaluating a regression model, you'll need to change it (as noted in the comments).

 

import dataiku

def create_managed_dataset(project_key, dataset_name, connection_name):
    """
    Creates a managed dataset in a given project, in a given connection
    """
    client = dataiku.api_client()
    project = client.get_project(project_key)
    
    ds_helper = project.new_managed_dataset_creation_helper(dataset_name).with_store_into(connection_name)
    
    ds_helper.create()

def create_datasets_for_evaluation_recipe(project_key, main_output_dataset, metrics_output_dataset, connection_name):
    """
    Creates two managed datasets (for an evaluation recipe) in a given project, in a given connection
    """
    create_managed_dataset(project_key, main_output_dataset, connection_name)
    create_managed_dataset(project_key, metrics_output_dataset, connection_name)

def create_evaluation_recipe_shell(project_key, input_dataset_name, model_id, main_output_dataset, metrics_output_dataset, connection_name):
    """
    Creates a new 'shell' evaluation recipe in a given project
    Returns the recipe name
    """
    client = dataiku.api_client()
    project = client.get_project(project_key)
    
    create_datasets_for_evaluation_recipe(project_key, main_output_dataset, metrics_output_dataset, connection_name)
    
    recipe_name = 'evaluate_on_' + input_dataset_name
    
    recipe_proto = {"type": 'evaluation',
                    "name": recipe_name,
                    "inputs": {'main': {'items': [{'deps': [],
                                  'ref': input_dataset_name}]},
                               'model': {'items': [{'deps': [], 'ref': model_id}]}},
                    "outputs": {'main': {'items': [{'appendMode': False,
                                  'ref': main_output_dataset}]},
                                'metrics': {'items': [{'appendMode': True,
                                  'ref': metrics_output_dataset}]}}
                    }
    
    creation_settings = {}
    creation_settings["rawCreation"] = True
    
    new_recipe = project.create_recipe(recipe_proto, creation_settings)
    
    return new_recipe.name

def update_evaluation_recipe_shell(project_key, recipe_name, raw_payload):
    """
    Updates a 'shell' evaluation recipe with a proper raw payload
    """
    client = dataiku.api_client()
    project = client.get_project(project_key)
    
    recipe = project.get_recipe(recipe_name)
    recipe_settings = recipe.get_settings()
    recipe_settings.set_payload(raw_payload)
    recipe_settings.save()
    

def create_new_evaluation_recipe(project_key, input_dataset_name, model_id, main_output_dataset, metrics_output_dataset, connection_name, raw_payload):
    """
    Creates a new evaluation recipe
    """
    new_recipe_name = create_evaluation_recipe_shell(project_key, input_dataset_name, model_id, main_output_dataset, metrics_output_dataset, connection_name)
    update_evaluation_recipe_shell(project_key, new_recipe_name, raw_payload)

project_key = "PROJECT_KEY"
input_dataset_name = "INPUT_DATASET_NAME"
model_id = "MODEL_ID"
main_output_dataset = "NEW_MAIN_OUTPUT_DATASET_NAME"
metrics_output_dataset = "NEW_METRICS_OUTPUT_DATASET_NAME"
connection_name = "CONNECTION_NAME"  # e.g. "filesystem_managed"

# This raw payload works for classification models. To get the payload for a regression model, get the settings
# of another Evaluate recipe (project.get_recipe("<existing_evaluate_recipe>").get_settings().get_payload()) and copy it over here.
# You may want to change some of these values if you'd like to see different metrics in your output dataset, or to enable GPUs for scoring, for example
raw_payload = '{\n  "outputs": [\n    "prediction_correct"\n  ],\n  "perGPUMemoryFraction": 0.5,\n  "sqlPipelineParams": {\n    "pipelineAllowMerge": true,\n    "pipelineAllowStart": true\n  },\n  "backendType": "PY_MEMORY",\n  "filterInputColumns": false,\n  "useGPU": false,\n  "keptInputColumns": [],\n  "pythonBatchSize": 100000,\n  "gpuAllowGrowth": false,\n  "outputProbaPercentiles": false,\n  "outputProbabilities": true,\n  "forceOriginalEngine": false,\n  "metrics": [\n    "precision",\n    "recall",\n    "auc",\n    "f1",\n    "accuracy",\n    "mcc",\n    "hammingLoss",\n    "logLoss",\n    "lift",\n    "calibrationLoss",\n    "customScore"\n  ],\n  "individualExplanationParams": {\n    "subChunkSize": 5000,\n    "method": "ICE",\n    "shapleyBackgroundSize": 100,\n    "nbExplanations": 3\n  },\n  "forcedClassifierThreshold": 0,\n  "batchSize": 100,\n  "outputExplanations": false,\n  "overrideModelSpecifiedThreshold": false,\n  "sparkParams": {\n    "pipelineAllowMerge": true,\n    "sparkPreparedDFStorageLevel": "MEMORY_AND_DISK",\n    "pipelineAllowStart": true,\n    "sparkExecutionEngine": "SPARK_SUBMIT",\n    "sparkConf": {\n      "inheritConf": "default",\n      "conf": []\n    },\n    "sparkRepartitionNonHDFS": 1,\n    "sparkUseGlobalMetastore": false\n  },\n  "gpuList": [\n    0\n  ]\n}'
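# Optional (a hedged sketch, not part of the original workaround): raw_payload is plain JSON, so you can
# tweak individual settings programmatically instead of editing the string by hand, e.g. to keep only a
# few metrics in the metrics output dataset:
#
# import json
# payload_dict = json.loads(raw_payload)
# payload_dict["metrics"] = ["auc", "accuracy", "f1"]
# raw_payload = json.dumps(payload_dict, indent=2)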

create_new_evaluation_recipe(project_key, input_dataset_name, model_id, main_output_dataset, metrics_output_dataset, connection_name, raw_payload)
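
If you need the regression payload mentioned in the comment above, here is a minimal sketch (untested; the recipe name below is a placeholder) of how to dump the payload of an existing Evaluate recipe, for example one created in the UI on a regression model, so you can reuse it as raw_payload:

import dataiku

client = dataiku.api_client()
project = client.get_project("PROJECT_KEY")

# "evaluate_on_regression_test" is a hypothetical recipe name - replace it with an Evaluate
# recipe that already exists in your Flow for a regression model
existing_recipe = project.get_recipe("evaluate_on_regression_test")
regression_payload = existing_recipe.get_settings().get_payload()
print(regression_payload)  # paste this JSON string into raw_payload above

Once the recipe exists, you can run it as usual by building its main output dataset, either from the Flow or in a scenario build step.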

 

Best,

Pat

 
