Code provided in "Sessionization in Python" does not work
While going through the Python part of the Dataiku DSS tutorial "Sessionization in SQL, Hive, Pig and Python", I get the following error:
Job failed: Error in Python process: At line 18: <type 'exceptions.TypeError'>: unsupported operand type(s) for -: 'str' and 'float'
when executing the following Python code:
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Sessionization recipe: reads the "toy_data" dataset, flags rows that start a
# new session (gap of at least T since the same user's previous row), derives a
# per-user session id, and writes the result to "Sessionization_in_Python".
import dataiku
import pandas as pd
from datetime import timedelta
# define threshold value (30 minutes)
T = timedelta(seconds=30*60)
# load dataset
toy_data = dataiku.Dataset("toy_data").get_dataframe()
# add a column containing previous timestamp:
# within each user_id group every non-key column is shifted down by one row,
# and the shifted frame is appended column-wise to the original one
toy_data = pd.concat([toy_data, toy_data.groupby('user_id').transform(lambda x:x.shift(1))],axis=1)
# columns are renamed positionally, so the concatenated frame must have exactly
# three columns (i.e. the input must have exactly two)
toy_data.columns = ['user_id','mytimestamp','prev_mytimestamp']
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the new session column
# NOTE(review): this is the only subtraction in the script, so the reported
# "unsupported operand type(s) for -: 'str' and 'float'" presumably originates
# here -- it would happen if 'mytimestamp' holds strings while the shifted
# column holds NaN floats; confirm the column dtypes returned by get_dataframe()
toy_data['new_session'] = ((toy_data['mytimestamp'] - toy_data['prev_mytimestamp'])>=T).astype(int)
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the session_id
# running count of session starts per user; combined with the user id below it
# forms the session identifier "<user_id>_<increment>"
toy_data['increment'] = toy_data.groupby("user_id")['new_session'].cumsum()
toy_data['session_id'] = toy_data['user_id'].astype(str) + '_' + toy_data['increment'].astype(str)
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# to get the same result as with hive/postgresql
# NOTE(review): DataFrame.sort is not available in recent pandas releases
# (sort_values is the replacement) -- confirm against the installed version
toy_data = toy_data.sort(['user_id','mytimestamp'])
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
sessionization_in_Python = dataiku.Dataset("Sessionization_in_Python")
sessionization_in_Python.write_with_schema(toy_data)
Best Answer
-
Herve Partner, Dataiku DSS Core Designer, Dataiku DSS & SQL, Dataiku DSS ML Practitioner, Dataiku DSS Core Concepts, Dataiku DSS Adv Designer, Registered Posts: 58 Partner
Just sharing working code:
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Sessionization recipe: reads the "toy_data" dataset, flags every row that
# starts a new session (gap of at least T since the same user's previous row),
# derives a per-user session id and writes the result to
# "Sessionization_in_Python".
import dataiku
import pandas as pd
import datetime
from datetime import timedelta

# define threshold value: a gap of 30 minutes or more starts a new session
T = timedelta(seconds=30*60)

# load dataset
toy_data = dataiku.Dataset("toy_data").get_dataframe()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# The input must have exactly two columns; they are named positionally
# (first = user id, second = timestamp).
toy_data.columns = ['user_id', 'mytimestamp']

# to get the same result as with hive/postgresql
# Sorting is done up front so that "previous row" below really means
# "previous event of the same user".
toy_data = toy_data.sort_values(by=['user_id', 'mytimestamp'])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# add a column containing previous timestamp
# The shift is done per user, so the first row of each user has no previous
# timestamp (NaN) instead of inheriting the last timestamp of another user.
toy_data['prev_mytimestamp'] = toy_data.groupby('user_id')['mytimestamp'].shift(1)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the new session column
# The timestamps are parsed into temporary series only for the comparison,
# so the stored columns keep their original values. A missing previous
# timestamp parses to NaT, and a NaT difference compares as False, so the
# first row of each user gets new_session = 0.
timestamp_format = '%Y-%m-%dT%H:%M:%S'
current_ts = pd.to_datetime(toy_data['mytimestamp'], format=timestamp_format)
previous_ts = pd.to_datetime(toy_data['prev_mytimestamp'], format=timestamp_format)
toy_data['new_session'] = ((current_ts - previous_ts) >= T).astype(int)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the session_id
# running count of session starts per user, combined with the user id as
# "<user_id>_<increment>"
toy_data['increment'] = toy_data.groupby("user_id")['new_session'].cumsum()
toy_data['session_id'] = toy_data['user_id'].astype(str) + '_' + toy_data['increment'].astype(str)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# parenthesized form works on both Python 2 and Python 3
print(toy_data)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
sessionization_in_Python = dataiku.Dataset("Sessionization_in_Python")
sessionization_in_Python.write_with_schema(toy_data)