How to generate a new Dataset from a custom recipe
Alan_Fusté
Hello, I'm trying to generate a new Dataset from a custom Python recipe (inside a Plugin), but I always get a Disk Error (DSS can't find the Dataset, even though I can see it in the Flow). I think I'm not "linking" the Dataset to the actual file correctly.
How can I do that?
Thanks!
My Python code is:

import os
import shutil
import dataiku

output_path = 'path_to_dss/dataiku/data/uploads/' + input_project_name + '/datasets/' + output_dataset_name + '/' + output_dataset_name + '.csv.gz'
input_path = 'path_to_dss/dataiku/data/uploads/' + input_project_name + '/datasets/' + input_dataset_name + '/' + input_dataset_name + '.csv.gz'

# Copy the uploaded input file into the folder the new dataset should point at
if not os.path.exists('path_to_dss/dataiku/data/uploads/' + input_project_name + '/datasets/' + output_dataset_name + '/'):
    os.mkdir('path_to_dss/dataiku/data/uploads/' + input_project_name + '/datasets/' + output_dataset_name + '/')
shutil.copy(input_path, output_path)

# Declare the copied file as a new Filesystem dataset in the project
client = dataiku.api_client()
project = client.get_project(input_project_name)
output_dataset = project.create_dataset(output_dataset_name,
                                        'Filesystem',
                                        params={'connection': 'filesystem_root', 'path': output_path},
                                        formatType='csv',
                                        formatParams={'separator': ',', 'style': 'excel', 'parseHeaderRow': True})
output_dataset.set_schema(project.get_dataset(input_dataset_name).get_schema())
output_dataset = dataiku.Dataset(output_dataset_name, project_key=input_project_name, ignore_flow=True)
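
For reference, one way to see why DSS can't find the file is to compare what it actually stores for the new dataset against the working uploaded one. A small sketch, reusing the variables from the code above; the 'params' and 'formatType' of the created dataset should match how DSS describes the uploaded dataset:

import dataiku

client = dataiku.api_client()
project = client.get_project(input_project_name)

# Definition of the original uploaded dataset (known to work)
print(project.get_dataset(input_dataset_name).get_definition())
# Definition of the dataset created above; compare 'params', 'formatType', 'formatParams'
print(project.get_dataset(output_dataset_name).get_definition())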
Best Answer
With a Sync recipe:
import dataiku
from dataikuapi.dss.recipe import SyncRecipeCreator

client = dataiku.api_client()
project = client.get_project(project_name)

# Create an intermediate managed dataset and a Sync recipe that feeds it
middle_dataset_name = 'pre_' + output_dataset_name
builder = SyncRecipeCreator("sync_" + middle_dataset_name, project)
builder = builder.with_input(input_dataset_name)
builder = builder.with_new_output(middle_dataset_name, "filesystem_managed")
recipe = builder.build()

# Let the recipe propagate the input schema by name (free schema mode)
recipe_def = recipe.get_definition_and_payload()
recipe_payload = {}
recipe_def.data['recipe']['params']['schemaMode'] = 'FREE_SCHEMA_NAME_BASED'
recipe_def.set_json_payload(recipe_payload)
recipe.set_definition_and_payload(recipe_def)

middle_dataset = dataiku.Dataset(middle_dataset_name, project_key=project_name, ignore_flow=True)
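
To actually fill the new dataset, the Sync recipe still needs to be run. A minimal follow-up sketch, reusing the project and middle_dataset variables from above and assuming a non-partitioned output:

# Build the output of the Sync recipe (non-recursive, forced build)
job = project.start_job({
    "type": "NON_RECURSIVE_FORCED_BUILD",
    "outputs": [{"id": middle_dataset_name, "partition": "NONE"}]
})
# job.get_status() can be polled until the build has finished

# Once built, the synced data can be read back, e.g. as a pandas DataFrame
df = middle_dataset.get_dataframe()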