This is a question I've asked myself and solved with a little Python code, so I thought I'd share. I had a folder with several subfolders, each containing a JPEG for each page of the original PDF (for context, this folder is the output of the Greyscale recipe from our Text Extraction plugin). I really only want to parse data/create RAG pipelines from the first page of every file. First, I ran the List Files recipe on my input folder. That output and the input folder are the inputs to my Python recipe. Here is the code I used to create that subset:
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import shutil
import os

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Read recipe inputs
input_dataset = dataiku.Dataset("jpegs_local_files")  # Change this to your List Files output
df = input_dataset.get_dataframe()

input_folder = dataiku.Folder("WT1Fqq9q")  # Change to your input folder ID
input_folder_path = input_folder.get_path()
print(input_folder_path)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Create a list of the files I want in my output folder
filtered_files = df[df["path"].str.endswith("1.jpg")]["path"].tolist()
print(filtered_files)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
output_folder = dataiku.Folder("88GP6J5f")  # Change to your output folder ID
output_folder_path = output_folder.get_path()
print(output_folder_path)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Loop through the list of files you want and copy them into your output folder path
# Note: this code will need a different method if your output folder is not stored in the managed filesystem
for file_path in filtered_files:
    source_path = os.path.join(input_folder_path, file_path.lstrip("/"))
    destination_path = os.path.join(output_folder_path, os.path.basename(file_path))  # Destination path
    print(f"Checking file: {source_path}")
    if os.path.exists(source_path):  # Ensure the file exists
        shutil.copy(source_path, destination_path)
        print(f"Copied: {file_path} → {destination_path}")
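One caveat with the endswith("1.jpg") filter above: it also matches pages 11, 21, and so on. If your page numbers are not zero-padded, a regex keeps only true page-1 files. Here is a minimal sketch, assuming filenames like mydoc_1.jpg or mydoc_01.jpg (the underscore naming is an assumption, so adjust the pattern to your own files):

# Minimal sketch (not part of the original recipe): keep only true page-1 files
import dataiku

input_dataset = dataiku.Dataset("jpegs_local_files")  # same List Files output as above
df = input_dataset.get_dataframe()

# Assumes filenames like "mydoc_1.jpg" or "mydoc_01.jpg" -- adjust the regex to your naming scheme
filtered_files = df[df["path"].str.contains(r"_0*1\.jpg$", regex=True)]["path"].tolist()
print(filtered_files)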
The above code only works when you are using local folders (commonly the managed_filesystem connection). If you are using blob storage (like S3), the get_path() method will not work. Here is code that works for blob-backed folders:
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import shutil
import os

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Read recipe inputs
input_folder = dataiku.Folder("DKiJy3eW")  # Change this to your folder ID

# List all files in the S3 input folder
input_files = input_folder.list_paths_in_partition()
print("Files in input folder:", input_files)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Filter files that end with "01.jpg" (page 1)
filtered_files = [file for file in input_files if file.endswith("01.jpg")]  # Change this to the pattern you want
print("Filtered files:", filtered_files)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
output_folder = dataiku.Folder("TfKzsauV")  # Change to your output folder ID

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Copy selected files from the S3 input folder to the S3 output folder
for file_path in filtered_files:
    print(f"Processing file: {file_path}")

    # Extract the filename (remove folder structure)
    filename = file_path.split("/")[-1]

    # Read the file from S3 as a stream
    with input_folder.get_download_stream(file_path) as stream:
        # Upload the stream to the S3 output folder
        output_folder.upload_stream(filename, stream)

    print(f"Copied: {file_path} → {filename} in output folder")
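One thing to watch in both versions: since only the base filename is kept, two subfolders that each contain a page-1 file with the same name (say 01.jpg) would overwrite each other in the output folder. If that can happen in your data, you can keep the original relative path when uploading instead. A minimal sketch, reusing the folder IDs from the S3 example above:

# Minimal sketch (not part of the original recipe): preserve subfolder structure in the output
import dataiku

input_folder = dataiku.Folder("DKiJy3eW")   # same input folder ID as above
output_folder = dataiku.Folder("TfKzsauV")  # same output folder ID as above

filtered_files = [f for f in input_folder.list_paths_in_partition() if f.endswith("01.jpg")]

for file_path in filtered_files:
    # Upload under the original relative path so identically named files
    # from different subfolders do not collide in the output folder
    with input_folder.get_download_stream(file_path) as stream:
        output_folder.upload_stream(file_path, stream)
    print(f"Copied: {file_path} (path preserved in output folder)")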