Submit your use case or success story to the 2023 edition of the Dataiku Frontrunner Awards ENTER YOUR SUBMISSION

code provided in "Sessionization in Python" does not work

Solved!
Herve
Level 4
code provided in "Sessionization in Python" does not work

Trying to go thru Python and Dataiku DSS tutorial, Sessionization in SQL, Hive, Pig and Python, I get the following error

Job failed: Error in Python process: At line 18: <type 'exceptions.TypeError'>: unsupported operand type(s) for -: 'str' and 'float'

when executing python code 

 

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import dataiku
import pandas as pd
from datetime import timedelta

# define treshold value
T = timedelta(seconds=30*60)

# load dataset
toy_data = dataiku.Dataset("toy_data").get_dataframe()

# add a column containing previous timestamp
toy_data = pd.concat([toy_data, toy_data.groupby('user_id').transform(lambda x:x.shift(1))],axis=1)
toy_data.columns = ['user_id','mytimestamp','prev_mytimestamp']

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the new session column
toy_data['new_session'] = ((toy_data['mytimestamp'] - toy_data['prev_mytimestamp'])>=T).astype(int)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the session_id
toy_data['increment'] = toy_data.groupby("user_id")['new_session'].cumsum()
toy_data['session_id'] = toy_data['user_id'].astype(str) + '_' + toy_data['increment'].astype(str)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# to get the same result as with hive/postgresql
toy_data = toy_data.sort(['user_id','mytimestamp'])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
sessionization_in_Python = dataiku.Dataset("Sessionization_in_Python")
sessionization_in_Python.write_with_schema(toy_data)

0 Kudos
1 Solution
Herve
Level 4
Author

Just sharing working code :

 

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import dataiku
import pandas as pd
import datetime
from datetime import timedelta

# define treshold value
T = timedelta(seconds=30*60)

# load dataset
toy_data = dataiku.Dataset("toy_data").get_dataframe()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# add a column containing previous timestamp
toy_data = pd.concat([toy_data, toy_data.transform(lambda x:x.shift(1))],axis=1)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
toy_data.columns = ['user_id','mytimestamp','dup_user_id','prev_mytimestamp']

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
toy_data.drop(toy_data.columns[2], axis=1,inplace=True)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the new session column
toy_data['new_session'] = [0]*16
for i in range(1,16):
toy_data['new_session'][i] = int(datetime.datetime.strptime(toy_data['mytimestamp'][i] , '%Y-%m-%dT%H:%M:%S') - datetime.datetime.strptime(toy_data['prev_mytimestamp'][i], '%Y-%m-%dT%H:%M:%S')>=T)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the session_id
toy_data['increment'] = toy_data.groupby("user_id")['new_session'].cumsum()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
toy_data['session_id'] = toy_data['user_id'].astype(str) + '_' + toy_data['increment'].astype(str)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# to get the same result as with hive/postgresql
toy_data = toy_data.sort_values(by=['user_id','mytimestamp'])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
print toy_data

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
sessionization_in_Python = dataiku.Dataset("Sessionization_in_Python")
sessionization_in_Python.write_with_schema(toy_data)

View solution in original post

0 Kudos
1 Reply
Herve
Level 4
Author

Just sharing working code :

 

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import dataiku
import pandas as pd
import datetime
from datetime import timedelta

# define treshold value
T = timedelta(seconds=30*60)

# load dataset
toy_data = dataiku.Dataset("toy_data").get_dataframe()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# add a column containing previous timestamp
toy_data = pd.concat([toy_data, toy_data.transform(lambda x:x.shift(1))],axis=1)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
toy_data.columns = ['user_id','mytimestamp','dup_user_id','prev_mytimestamp']

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
toy_data.drop(toy_data.columns[2], axis=1,inplace=True)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the new session column
toy_data['new_session'] = [0]*16
for i in range(1,16):
toy_data['new_session'][i] = int(datetime.datetime.strptime(toy_data['mytimestamp'][i] , '%Y-%m-%dT%H:%M:%S') - datetime.datetime.strptime(toy_data['prev_mytimestamp'][i], '%Y-%m-%dT%H:%M:%S')>=T)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the session_id
toy_data['increment'] = toy_data.groupby("user_id")['new_session'].cumsum()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
toy_data['session_id'] = toy_data['user_id'].astype(str) + '_' + toy_data['increment'].astype(str)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# to get the same result as with hive/postgresql
toy_data = toy_data.sort_values(by=['user_id','mytimestamp'])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
print toy_data

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
sessionization_in_Python = dataiku.Dataset("Sessionization_in_Python")
sessionization_in_Python.write_with_schema(toy_data)

0 Kudos

Labels

?
Labels (1)
A banner prompting to get Dataiku