code provided in "Sessionization in Python" does not work

Solved!
Herve
Level 4
code provided in "Sessionization in Python" does not work

While going through the Python part of the Dataiku DSS tutorial "Sessionization in SQL, Hive, Pig and Python", I get the following error:

Job failed: Error in Python process: At line 18: <type 'exceptions.TypeError'>: unsupported operand type(s) for -: 'str' and 'float'

when executing the following Python code:

 

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Sessionization recipe as quoted from the tutorial. This is the version that
# is reported as failing; it is kept unchanged here for reference.
import dataiku
import pandas as pd
from datetime import timedelta

# define threshold value: a gap of at least 30 minutes starts a new session
T = timedelta(seconds=30*60)

# load dataset
toy_data = dataiku.Dataset("toy_data").get_dataframe()

# add a column containing previous timestamp
# Within each user_id group every column is shifted down by one row, so each
# row receives the value of the preceding row of the same user (the first row
# of each user gets a missing value). The rename below assumes the
# concatenated frame has exactly three columns.
toy_data = pd.concat([toy_data, toy_data.groupby('user_id').transform(lambda x:x.shift(1))],axis=1)
toy_data.columns = ['user_id','mytimestamp','prev_mytimestamp']

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the new session column
# NOTE(review): this subtraction is the statement the reported TypeError
# ("unsupported operand type(s) for -: 'str' and 'float'") points to (line 18
# of the recipe as originally posted). Presumably 'mytimestamp' is loaded as
# plain strings and the shifted column holds NaN (a float) for each user's
# first row -- confirm against the dataset schema.
toy_data['new_session'] = ((toy_data['mytimestamp'] - toy_data['prev_mytimestamp'])>=T).astype(int)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the session_id
# The per-user running total of new_session numbers the sessions of each user;
# the session id is "<user_id>_<session number>".
toy_data['increment'] = toy_data.groupby("user_id")['new_session'].cumsum()
toy_data['session_id'] = toy_data['user_id'].astype(str) + '_' + toy_data['increment'].astype(str)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# to get the same result as with hive/postgresql
# NOTE(review): DataFrame.sort was deprecated and later removed from pandas;
# sort_values(by=[...]) is the replacement on newer versions.
toy_data = toy_data.sort(['user_id','mytimestamp'])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
sessionization_in_Python = dataiku.Dataset("Sessionization_in_Python")
sessionization_in_Python.write_with_schema(toy_data)

0 Kudos
1 Solution
Herve
Level 4
Author

Just sharing working code :

 

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Sessionization recipe: tags every event of "toy_data" with a per-user
# session id and writes the result to "Sessionization_in_Python".
import dataiku
import pandas as pd
import datetime
from datetime import timedelta

# define threshold value: a gap of at least 30 minutes starts a new session
T = timedelta(seconds=30*60)

# format of the timestamp strings in the 'mytimestamp' column
TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%S'

# load dataset (expected to hold exactly two columns: user id and timestamp)
toy_data = dataiku.Dataset("toy_data").get_dataframe()
toy_data.columns = ['user_id','mytimestamp']

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# add a column containing previous timestamp
# Sort first so that "previous row" really means "previous event in time";
# this also yields the same row order as the hive/postgresql versions.
toy_data = toy_data.sort_values(by=['user_id','mytimestamp'])
# Shift inside each user_id group so that the first event of a user has no
# predecessor instead of inheriting the last event of another user.
toy_data['prev_mytimestamp'] = toy_data.groupby('user_id')['mytimestamp'].shift(1)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the new session column
# The timestamps are parsed into temporary Series only, so the columns that
# get written out keep their original values. Works for any number of rows.
current_ts = pd.to_datetime(toy_data['mytimestamp'], format=TIMESTAMP_FORMAT)
previous_ts = pd.to_datetime(toy_data['prev_mytimestamp'], format=TIMESTAMP_FORMAT)
# A missing previous timestamp gives a missing gap, which compares as False,
# so the first event of every user is flagged 0.
toy_data['new_session'] = ((current_ts - previous_ts) >= T).astype(int)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the session_id
# The per-user running total of new_session numbers the sessions of each user.
toy_data['increment'] = toy_data.groupby("user_id")['new_session'].cumsum()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
toy_data['session_id'] = toy_data['user_id'].astype(str) + '_' + toy_data['increment'].astype(str)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# print(...) with a single argument behaves the same on Python 2 and Python 3
print(toy_data)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
sessionization_in_Python = dataiku.Dataset("Sessionization_in_Python")
sessionization_in_Python.write_with_schema(toy_data)

View solution in original post

0 Kudos
1 Reply
Herve
Level 4
Author

Just sharing working code :

 

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Sessionization recipe: tags every event of "toy_data" with a per-user
# session id and writes the result to "Sessionization_in_Python".
import dataiku
import pandas as pd
import datetime
from datetime import timedelta

# define threshold value: a gap of at least 30 minutes starts a new session
T = timedelta(seconds=30*60)

# format of the timestamp strings in the 'mytimestamp' column
TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%S'

# load dataset (expected to hold exactly two columns: user id and timestamp)
toy_data = dataiku.Dataset("toy_data").get_dataframe()
toy_data.columns = ['user_id','mytimestamp']

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# add a column containing previous timestamp
# Sort first so that "previous row" really means "previous event in time";
# this also yields the same row order as the hive/postgresql versions.
toy_data = toy_data.sort_values(by=['user_id','mytimestamp'])
# Shift inside each user_id group so that the first event of a user has no
# predecessor instead of inheriting the last event of another user.
toy_data['prev_mytimestamp'] = toy_data.groupby('user_id')['mytimestamp'].shift(1)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the new session column
# The timestamps are parsed into temporary Series only, so the columns that
# get written out keep their original values. Works for any number of rows.
current_ts = pd.to_datetime(toy_data['mytimestamp'], format=TIMESTAMP_FORMAT)
previous_ts = pd.to_datetime(toy_data['prev_mytimestamp'], format=TIMESTAMP_FORMAT)
# A missing previous timestamp gives a missing gap, which compares as False,
# so the first event of every user is flagged 0.
toy_data['new_session'] = ((current_ts - previous_ts) >= T).astype(int)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the session_id
# The per-user running total of new_session numbers the sessions of each user.
toy_data['increment'] = toy_data.groupby("user_id")['new_session'].cumsum()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
toy_data['session_id'] = toy_data['user_id'].astype(str) + '_' + toy_data['increment'].astype(str)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# print(...) with a single argument behaves the same on Python 2 and Python 3
print(toy_data)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
sessionization_in_Python = dataiku.Dataset("Sessionization_in_Python")
sessionization_in_Python.write_with_schema(toy_data)

0 Kudos

Labels

?
Labels (1)
A banner prompting to get Dataiku