Create N output datasets dynamically

info-rchitect · ‎02-20-2024

Hi,

I have a dataset which I want to partition into N datasets, where N will change over time. N is > 30 so I don't want to have to manually declare each output dataset in my Python recipe. It is easy enough in Python to create the N dataframes I want to use as the source for each dataset. Can I do this dynamically without declaring each output dataset manually?

thx

Operating system used: Windows 10

June · ‎02-20-2024

This can be done using Python in a scenario.

Here is some sample code which dynamically creates & names tables and writes them as Dataiku tables.

"""From the Superstore Toy Data, create seperate datasets for each city"""

import dataiku
from dataiku import pandasutils as pdu
from dataiku import api_client
import datetime as dt
import numpy as np
import pandas as pd


#Instantiate the client and resolve the project this code runs in
client = api_client()
proj = client.get_default_project()

#Manage where the output data will be stored
MY_DB_CNXN = 'My_Database_Connection'  #Name of a Dataiku database connection to write to, OR
local_filesystem = 'filesystem_managed'  #Connection used to write to the local filesystem
write_output_to = local_filesystem  #By default we will use the local filesystem

#Number of per-city datasets to create for this demo
SAMPLE_SIZE = 5

# Read recipe inputs
Superstore = dataiku.Dataset("Superstore")
store_df = Superstore.get_dataframe()

#Pre-Process Text, Get Unique City Values
#Missing cities are bucketed under 'OTHER'. Names that become empty once the
#non-letter characters are stripped go into the same bucket instead of being
#turned into NaN, because NaN cannot be used as a dataset name and never
#matches the equality filter used below.
city = store_df['City'].fillna('OTHER').astype(str)
city = city.str.replace(r'[^A-Za-z ]', '', regex=True)
city = city.replace('', 'OTHER')
city = city.str.upper()
store_df['City'] = city.str.replace(' ', '_', regex=False)
cities = list(store_df['City'].unique())

#Creating a small sample of unique cities so we only make 5 new datasets for this demo
sample = cities[:SAMPLE_SIZE]

#List the project's datasets once instead of once per city
existing_names = {item.name for item in proj.list_datasets()}

for tbl_name in sample:
    df = store_df[store_df['City'] == tbl_name]

    #Create the managed dataset if the project does not have it yet
    if tbl_name not in existing_names:
        builder = proj.new_managed_dataset(tbl_name)
        builder.with_store_into(write_output_to)
        builder.create()
        existing_names.add(tbl_name)

    #write output (the schema is taken from the dataframe)
    output_ds = dataiku.Dataset(tbl_name)
    output_ds.write_with_schema(df)

View solution in original post

Turribeach · ‎02-20-2024

@June Could you please use a code block (the </> icon in the toolbar) to post your code snippet? It has lost all of its indentation, so it won’t execute properly in Python.

View solution in original post

June · ‎02-20-2024

This can be done using Python in a scenario.

Here is some sample code which dynamically creates & names tables and writes them as Dataiku tables.

"""From the Superstore Toy Data, create seperate datasets for each city"""

import dataiku
from dataiku import pandasutils as pdu
from dataiku import api_client
import datetime as dt
import numpy as np
import pandas as pd


#Instantiate the client and resolve the project this code runs in
client = api_client()
proj = client.get_default_project()

#Manage where the output data will be stored
MY_DB_CNXN = 'My_Database_Connection'  #Name of a Dataiku database connection to write to, OR
local_filesystem = 'filesystem_managed'  #Connection used to write to the local filesystem
write_output_to = local_filesystem  #By default we will use the local filesystem

#Number of per-city datasets to create for this demo
SAMPLE_SIZE = 5

# Read recipe inputs
Superstore = dataiku.Dataset("Superstore")
store_df = Superstore.get_dataframe()

#Pre-Process Text, Get Unique City Values
#Missing cities are bucketed under 'OTHER'. Names that become empty once the
#non-letter characters are stripped go into the same bucket instead of being
#turned into NaN, because NaN cannot be used as a dataset name and never
#matches the equality filter used below.
city = store_df['City'].fillna('OTHER').astype(str)
city = city.str.replace(r'[^A-Za-z ]', '', regex=True)
city = city.replace('', 'OTHER')
city = city.str.upper()
store_df['City'] = city.str.replace(' ', '_', regex=False)
cities = list(store_df['City'].unique())

#Creating a small sample of unique cities so we only make 5 new datasets for this demo
sample = cities[:SAMPLE_SIZE]

#List the project's datasets once instead of once per city
existing_names = {item.name for item in proj.list_datasets()}

for tbl_name in sample:
    df = store_df[store_df['City'] == tbl_name]

    #Create the managed dataset if the project does not have it yet
    if tbl_name not in existing_names:
        builder = proj.new_managed_dataset(tbl_name)
        builder.with_store_into(write_output_to)
        builder.create()
        existing_names.add(tbl_name)

    #write output (the schema is taken from the dataframe)
    output_ds = dataiku.Dataset(tbl_name)
    output_ds.write_with_schema(df)

Turribeach · ‎02-20-2024

@June Could you please use a code block (the </> icon in the toolbar) to post your code snippet? It has lost all of its indentation, so it won’t execute properly in Python.

June · ‎02-20-2024

Done, thanks!

Create N output datasets dynamically

Create N output datasets dynamically

Labels

Setup info

Sign up to take part

Create N output datasets dynamically

Create N output datasets dynamically

Labels

Setup info