Dear Dataiku Community,
My organization will still be using Dataiku 9.0.4 for a while before updating to Dataiku 10.
In this version, it seems that I can only create an Evaluation Recipe through the graphical user interface.
Indeed, the API code:
erc = p.new_recipe(type='evaluation', name="evaluate_on_mytest")
erc.with_input("mytest")
erc.with_input_model("mymodel")
erc.with_output("myscores")
erc.with_output_metrics("mymetrics")
er = erc.create()
er.run(wait=True)
should work in Dataiku 10, but it does not work in Dataiku 9 because there is no such thing as type='evaluation' for the RecipeCreator, i.e., there is no EvaluationRecipeCreator in https://github.com/dataiku/dataiku-api-client-python/blob/release/9.0/dataikuapi/dss/recipe.py.
I'm wondering if there is an alternative way to create an Evaluation Recipe in Dataiku 9.0.4 via API.
Thanks for your attention,
Daniele
Operating system used: Unix
Hi,
Indeed, this will be much simpler in v10+ using the code you pasted 🙂
Below is a workaround that you can try for your v9 instance. Note that the raw_payload at the bottom works for classification models. If you're doing a regression model, you'll need to change this (as noted in the comments).
import dataiku
def create_managed_dataset(project_key, dataset_name, connection_name):
"""
Creates a managed dataset in a given project, in a given connection
"""
client = dataiku.api_client()
project = client.get_project(project_key)
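    # The creation helper registers an empty managed dataset on the given connection;
    # its schema will be filled in the first time the evaluation recipe runs.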
ds_helper = project.new_managed_dataset_creation_helper(dataset_name).with_store_into(connection_name)
ds_helper.create()
def create_datasets_for_evaluation_recipe(project_key, main_output_dataset, metrics_output_dataset, connection_name):
"""
Creates two managed datasets (for an evaluation recipe) in a given project, in a given connection
"""
create_managed_dataset(project_key, main_output_dataset, connection_name)
create_managed_dataset(project_key, metrics_output_dataset, connection_name)
def create_evaluation_recipe_shell(project_key, input_dataset_name, model_id, main_output_dataset, metrics_output_dataset, connection_name):
"""
Creates a new 'shell' evaluation recipe in a given project
Returns the recipe name
"""
client = dataiku.api_client()
project = client.get_project(project_key)
create_datasets_for_evaluation_recipe(project_key, main_output_dataset, metrics_output_dataset, connection_name)
recipe_name = 'evaluate_on_' + input_dataset_name
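    # The prototype below wires the input dataset and the saved model to the two
    # managed outputs. Note that model_id must be the saved model's id (the short
    # identifier visible in the model's URL), not its display name.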
recipe_proto = {"type": 'evaluation',
"name": recipe_name,
"inputs": {'main': {'items': [{'deps': [],
'ref': input_dataset_name}]},
'model': {'items': [{'deps': [], 'ref': model_id}]}},
"outputs": {'main': {'items': [{'appendMode': False,
'ref': main_output_dataset}]},
'metrics': {'items': [{'appendMode': True,
'ref': metrics_output_dataset}]}}
}
creation_settings = {}
creation_settings["rawCreation"] = True
new_recipe = project.create_recipe(recipe_proto, creation_settings)
return new_recipe.name
def update_evaluation_recipe_shell(project_key, recipe_name, raw_payload):
"""
Updates a 'shell' evaluation recipe with a proper raw payload
"""
client = dataiku.api_client()
project = client.get_project(project_key)
recipe = project.get_recipe(recipe_name)
recipe_settings = recipe.get_settings()
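    # A recipe's payload holds its type-specific settings as a JSON string; for an
    # evaluation recipe this is where metrics, outputs, engine, etc. are defined.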
recipe_settings.set_payload(raw_payload)
recipe_settings.save()
def create_new_evaluation_recipe(project_key, input_dataset_name, model_id, main_output_dataset, metrics_output_dataset, connection_name, raw_payload):
"""
Creates a new evaluation recipe
"""
new_recipe_name = create_evaluation_recipe_shell(project_key, input_dataset_name, model_id, main_output_dataset, metrics_output_dataset, connection_name)
update_evaluation_recipe_shell(project_key, new_recipe_name, raw_payload)
project_key = "PROJECT_KEY"
input_dataset_name = "INPUT_DATASET_NAME"
model_id = "MODEL_ID"
main_output_dataset = "NEW_MAIN_OUTPUT_DATASET_NAME"
metrics_output_dataset = "NEW_METRICS_OUTPUT_DATASET_NAME"
connection_name = "CONNECTION_NAME"  # e.g. "filesystem_managed"
# This raw payload works for classification models. For regression models, grab the payload
# from an existing evaluation recipe (project.get_recipe(...).get_settings().get_payload())
# and copy it over; see the sketch after this script.
# You may want to change some of these values, for example to output different metrics
# or to enable GPUs for scoring.
raw_payload = """{
  "outputs": ["prediction_correct"],
  "perGPUMemoryFraction": 0.5,
  "sqlPipelineParams": {"pipelineAllowMerge": true, "pipelineAllowStart": true},
  "backendType": "PY_MEMORY",
  "filterInputColumns": false,
  "useGPU": false,
  "keptInputColumns": [],
  "pythonBatchSize": 100000,
  "gpuAllowGrowth": false,
  "outputProbaPercentiles": false,
  "outputProbabilities": true,
  "forceOriginalEngine": false,
  "metrics": ["precision", "recall", "auc", "f1", "accuracy", "mcc",
              "hammingLoss", "logLoss", "lift", "calibrationLoss", "customScore"],
  "individualExplanationParams": {"subChunkSize": 5000, "method": "ICE",
                                  "shapleyBackgroundSize": 100, "nbExplanations": 3},
  "forcedClassifierThreshold": 0,
  "batchSize": 100,
  "outputExplanations": false,
  "overrideModelSpecifiedThreshold": false,
  "sparkParams": {"pipelineAllowMerge": true,
                  "sparkPreparedDFStorageLevel": "MEMORY_AND_DISK",
                  "pipelineAllowStart": true,
                  "sparkExecutionEngine": "SPARK_SUBMIT",
                  "sparkConf": {"inheritConf": "default", "conf": []},
                  "sparkRepartitionNonHDFS": 1,
                  "sparkUseGlobalMetastore": false},
  "gpuList": [0]
}"""
create_new_evaluation_recipe(project_key, input_dataset_name, model_id, main_output_dataset, metrics_output_dataset, connection_name, raw_payload)
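If you need a regression payload, here is a minimal sketch of the copy-over approach mentioned in the comments ("evaluate_on_other" is a placeholder for an evaluation recipe that already exists in your project, e.g. one created in the UI):

client = dataiku.api_client()
project = client.get_project(project_key)
# Grab the payload of an existing (regression) evaluation recipe and reuse it
existing_payload = project.get_recipe("evaluate_on_other").get_settings().get_payload()
create_new_evaluation_recipe(project_key, input_dataset_name, model_id,
                             main_output_dataset, metrics_output_dataset,
                             connection_name, existing_payload)

Once the recipe exists, you can run it by building its main output dataset with a job, for example:

# Build the evaluation recipe's main output (and thereby run the recipe)
job = project.start_job({"type": "NON_RECURSIVE_FORCED_BUILD",
                         "outputs": [{"id": main_output_dataset}]})
print(job.get_status())  # poll this if you need to wait for completion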
Best,
Pat