Data Overwrite Issue in BigQuery Using Dataiku with Python

Guillaume5 Registered Posts: 8

Hello everyone,

I'm currently developing a Python script in Dataiku and encountering an issue related to data overwrite in BigQuery. Here's some context:

In my project, we're importing data from Teradata into BigQuery using Dataiku. The goal is to perform incremental updates, meaning adding new records for the last X days while preserving the existing data in BigQuery.

The process generally works, but whenever I use the write_with_schema method in Dataiku to insert data into the dataset linked to BigQuery, it seems to overwrite all the existing data, effectively resetting the table rather than appending the new data.
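
For reference, here is a simplified sketch of the recipe (the input dataset name is a placeholder; the output is the TEST_GJA dataset whose settings are pasted below):

import dataiku
import pandas as pd

# Read the incremental slice from the Teradata-backed input dataset
# ("TERADATA_SOURCE" is a placeholder name)
input_ds = dataiku.Dataset("TERADATA_SOURCE")
df = input_ds.get_dataframe()

# Keep only the records from the last X days (X = 7 here as an example)
df["DT_INTE"] = pd.to_datetime(df["DT_INTE"])
cutoff = pd.Timestamp.today().normalize() - pd.Timedelta(days=7)
df = df[df["DT_INTE"] >= cutoff]

# Write into the BigQuery-backed output dataset: this is the call that
# resets the whole table instead of appending the new rows
output_ds = dataiku.Dataset("TEST_GJA")
output_ds.write_with_schema(df)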

I'm looking for a solution to prevent this complete overwrite and enable an append mode. Has anyone here encountered similar issues or have suggestions on how to resolve this?

I would greatly appreciate any advice or direction that could help me configure the dataset correctly to preserve previously imported data in BigQuery.

Thank you in advance for your help and suggestions.

Dataset settings:

{
  "type": "BigQuery",
  "managed": false,
  "featureGroup": false,
  "name": "TEST_GJA",
  "smartName": "TEST_GJA",
  "projectKey": "PYTHONSANDBOX",
  "checklists": {
    "checklists": []
  },
  "checks": [],
  "customMeta": {
    "kv": {}
  },
  "flowOptions": {
    "virtualizable": false,
    "rebuildBehavior": "NORMAL",
    "crossProjectBuildBehavior": "DEFAULT"
  },
  "readWriteOptions": {
    "preserveOrder": false,
    "writeBuckets": 1,
    "forceSingleOutputFile": false,
    "defaultReadOrdering": {
      "enabled": false,
      "rules": []
    }
  },
  "partitioning": {
    "ignoreNonMatchingFile": false,
    "considerMissingRequestedPartitionsAsEmpty": false,
    "dimensions": []
  },
  "versionTag": {
    "versionNumber": 1,
    "lastModifiedBy": {
      "login": ""
    },
    "lastModifiedOn": 1747409330634
  },
  "creationTag": {
    "versionNumber": 0,
    "lastModifiedBy": {
      "login": ""
    },
    "lastModifiedOn": 1747404831835
  },
  "tags": [],
  "params": {
    "forbidPreviewFallbackToSelect": false,
    "requiresPartitionFilter": false,
    "useBigQueryPartitioning": false,
    "bigQueryPartitioningType": "DATE",
    "bigQueryPartitioningRangeStart": 0,
    "bigQueryPartitioningRangeEnd": 0,
    "bigQueryPartitioningRangeInterval": 0,
    "connection": "",
    "notReadyIfEmpty": false,
    "mode": "table",
    "partitioningType": "custom",
    "normalizeDoubles": true,
    "catalog": "",
    "table": "TEST_GJA",
    "schema": "travail",
    "tableCreationMode": "auto",
    "writeInsertBatchSize": 10000,
    "writeDescriptionsAsSQLComment": false,
    "noDropOnSchemaMismatch": true,
    "writeJDBCBadDataBehavior": "DISCARD_ROW",
    "datetimenotzReadMode": "AS_IS",
    "dateonlyReadMode": "AS_IS",
    "variablesExpansionLoopConfig": {
      "enabled": false,
      "mode": "CREATE_VARIABLE_FOR_EACH_COLUMN",
      "replacements": []
    },
    "sparkJdbcAccess": "NONE",
    "numPartitions": -1
  },
  "schema": {
    "columns": [
      {
        "name": "SITE",
        "type": "string",
        "maxLength": 16200
      },
      {
        "name": "NB_CDC",
        "type": "bigint"
      },
      {
        "name": "DT_INTE",
        "type": "date"
      }
    ],
    "userModified": true
  },
  "typeSystemVersion": "V2",
  "dkuProperties": [],
  "metrics": {
    "probes": [
      {
        "type": "basic",
        "enabled": true,
        "computeOnBuildMode": "PARTITION",
        "meta": {
          "name": "Basic data",
          "level": 0
        },
        "configuration": {}
      },
      {
        "type": "records",
        "enabled": true,
        "computeOnBuildMode": "NO",
        "meta": {
          "name": "Record count",
          "level": 0
        },
        "configuration": {}
      }
    ],
    "engineConfig": {
      "padRunsWithMetrics": false,
      "hive": {
        "active": true,
        "extraConf": []
      },
      "basic": {},
      "dss": {
        "active": true,
        "selection": {
          "useMemTable": false,
          "filter": {
            "distinct": false,
            "enabled": false
          },
          "partitionSelectionMethod": "ALL",
          "latestPartitionsN": 1,
          "ordering": {
            "enabled": false,
            "rules": []
          },
          "samplingMethod": "FULL",
          "maxRecords": -1,
          "targetRatio": 0.02,
          "ascending": true,
          "withinFirstN": -1,
          "maxReadUncompressedBytes": -1
        }
      },
      "sql": {
        "active": true
      },
      "impala": {
        "active": true
      },
      "spark": {
        "active": true,
        "extraConf": []
      },
      "python": {}
    },
    "displayedState": {
      "columns": [],
      "metrics": [
        "basic:COUNT_COLUMNS",
        "records:COUNT_RECORDS"
      ]
    }
  },
  "metricsChecks": {
    "monitor": false,
    "checks": [],
    "displayedState": {
      "checks": []
    }
  },
  "customFields": {}
}

Operating system used: Windows 10

Answers

  • Turribeach
    Turribeach Neuron, Posts: 2,501

    In the Inputs/Outputs tab of your Python recipe, enable the "Append instead of overwrite" option in the output section. Then, in your Dataset ⇒ Settings ⇒ Advanced, under "SQL schema inconsistency", make sure "Prevent data drop in append mode" is enabled to protect the table from being dropped if the outputs of your recipe change and no longer match your output dataset. Your recipe code itself does not need to change; see the sketch below.
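
    A minimal sketch of what the write looks like once the append option is enabled (reusing the TEST_GJA dataset from your post; the input dataset name is a placeholder):

    import dataiku

    # df is the incremental slice built upstream in the recipe
    # ("TERADATA_SOURCE" is a placeholder input dataset name)
    df = dataiku.Dataset("TERADATA_SOURCE").get_dataframe()

    output_ds = dataiku.Dataset("TEST_GJA")
    # With "Append instead of overwrite" enabled on the recipe output,
    # this same call now appends df to the existing BigQuery table
    # instead of dropping and recreating it
    output_ds.write_with_schema(df)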
