Sign up to take part
Registered users can ask their own questions, contribute to discussions, and be part of the Community!
Registered users can ask their own questions, contribute to discussions, and be part of the Community!
when executing python code
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import dataiku
import pandas as pd
from datetime import timedelta
# Inactivity threshold: a gap of 30 minutes or more between two consecutive
# events of the same user starts a new session.
T = timedelta(seconds=30*60)
# load dataset
toy_data = dataiku.Dataset("toy_data").get_dataframe()
# Order the events per user and by time BEFORE looking at "the previous row";
# otherwise the lag below depends on whatever order the dataset was read in.
# This is also the ordering that gives the same result as with hive/postgresql.
# (sort_values replaces DataFrame.sort, which no longer exists in pandas.)
toy_data = toy_data.sort_values(by=['user_id','mytimestamp'])
# add a column containing previous timestamp of the same user
# (NaN/NaT for the first event of each user)
toy_data['prev_mytimestamp'] = toy_data.groupby('user_id')['mytimestamp'].shift(1)
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the new session column
# Parse into datetimes for the arithmetic only, so the stored columns keep
# their original type. NOTE(review): assumes 'mytimestamp' is either already a
# datetime column or a string that pandas can parse -- confirm against the dataset.
current_ts = pd.to_datetime(toy_data['mytimestamp'])
previous_ts = pd.to_datetime(toy_data['prev_mytimestamp'])
# A missing previous timestamp compares as False, so the first event of every
# user gets new_session == 0.
toy_data['new_session'] = ((current_ts - previous_ts)>=T).astype(int)
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the session_id
# Running count of session starts per user: 0 for the first session, 1 for the next, ...
toy_data['increment'] = toy_data.groupby("user_id")['new_session'].cumsum()
# session_id is "<user_id>_<increment>"
toy_data['session_id'] = toy_data['user_id'].astype(str) + '_' + toy_data['increment'].astype(str)
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
sessionization_in_Python = dataiku.Dataset("Sessionization_in_Python")
sessionization_in_Python.write_with_schema(toy_data)
Just sharing working code:
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import dataiku
import pandas as pd
import datetime
from datetime import timedelta
# Inactivity threshold: a gap of 30 minutes or more between two consecutive
# events of the same user starts a new session.
T = timedelta(seconds=30*60)
# Layout of the timestamp strings stored in 'mytimestamp'
TS_FORMAT = '%Y-%m-%dT%H:%M:%S'
# load dataset
toy_data = dataiku.Dataset("toy_data").get_dataframe()
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Order the events per user and by time before computing the lag, so that
# "previous row" really means "previous event of this user". Timestamps in
# TS_FORMAT sort chronologically even as plain strings.
# This is also the ordering that gives the same result as with hive/postgresql.
toy_data = toy_data.sort_values(by=['user_id','mytimestamp'])
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# add a column containing previous timestamp
# Shift inside each user group: the first event of a user has no previous
# timestamp (NaN) instead of inheriting the last event of another user.
toy_data['prev_mytimestamp'] = toy_data.groupby('user_id')['mytimestamp'].shift(1)
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the new session column
# Vectorized over all rows, so the dataset may have any number of rows.
# The timestamps are parsed for the arithmetic only; the stored columns are
# left as they were read.
current_ts = pd.to_datetime(toy_data['mytimestamp'], format=TS_FORMAT)
previous_ts = pd.to_datetime(toy_data['prev_mytimestamp'], format=TS_FORMAT)
# A missing previous timestamp compares as False, so the first event of every
# user gets new_session == 0.
toy_data['new_session'] = ((current_ts - previous_ts)>=T).astype(int)
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the session_id
# Running count of session starts per user: 0 for the first session, 1 for the next, ...
toy_data['increment'] = toy_data.groupby("user_id")['new_session'].cumsum()
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# session_id is "<user_id>_<increment>"
toy_data['session_id'] = toy_data['user_id'].astype(str) + '_' + toy_data['increment'].astype(str)
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Parenthesized form works as a Python 2 print statement and a Python 3 call.
print(toy_data)
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
sessionization_in_Python = dataiku.Dataset("Sessionization_in_Python")
sessionization_in_Python.write_with_schema(toy_data)
Just sharing working code:
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import dataiku
import pandas as pd
import datetime
from datetime import timedelta
# Inactivity threshold: a gap of 30 minutes or more between two consecutive
# events of the same user starts a new session.
T = timedelta(seconds=30*60)
# Layout of the timestamp strings stored in 'mytimestamp'
TS_FORMAT = '%Y-%m-%dT%H:%M:%S'
# load dataset
toy_data = dataiku.Dataset("toy_data").get_dataframe()
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Order the events per user and by time before computing the lag, so that
# "previous row" really means "previous event of this user". Timestamps in
# TS_FORMAT sort chronologically even as plain strings.
# This is also the ordering that gives the same result as with hive/postgresql.
toy_data = toy_data.sort_values(by=['user_id','mytimestamp'])
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# add a column containing previous timestamp
# Shift inside each user group: the first event of a user has no previous
# timestamp (NaN) instead of inheriting the last event of another user.
toy_data['prev_mytimestamp'] = toy_data.groupby('user_id')['mytimestamp'].shift(1)
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the new session column
# Vectorized over all rows, so the dataset may have any number of rows.
# The timestamps are parsed for the arithmetic only; the stored columns are
# left as they were read.
current_ts = pd.to_datetime(toy_data['mytimestamp'], format=TS_FORMAT)
previous_ts = pd.to_datetime(toy_data['prev_mytimestamp'], format=TS_FORMAT)
# A missing previous timestamp compares as False, so the first event of every
# user gets new_session == 0.
toy_data['new_session'] = ((current_ts - previous_ts)>=T).astype(int)
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# create the session_id
# Running count of session starts per user: 0 for the first session, 1 for the next, ...
toy_data['increment'] = toy_data.groupby("user_id")['new_session'].cumsum()
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# session_id is "<user_id>_<increment>"
toy_data['session_id'] = toy_data['user_id'].astype(str) + '_' + toy_data['increment'].astype(str)
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Parenthesized form works as a Python 2 print statement and a Python 3 call.
print(toy_data)
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
sessionization_in_Python = dataiku.Dataset("Sessionization_in_Python")
sessionization_in_Python.write_with_schema(toy_data)