Icon

kn_example_python_tar_gzip_csv

Extract CSV data from .tar.gz file and import to KNIME

Extract CSV data from .tar.gz file and import to KNIME


# Extract CSV data from .tar.gz file and import to KNIME
# https://forum.knime.com/t/gzipped-csv-reader-without-saving-to-hdd/65425/7?u=mlauber71
#
# NOTE(review): the original export flattened the three Python Script node
# bodies onto single lines and duplicated the whole dump verbatim; below is
# the content reconstructed once, properly formatted. The trailing workflow
# annotations / node labels from the page are preserved as comments.

# --- Script 1: bundle the CSV file into a .tar.gz archive --------------------
# Reads the source CSV path from the 'File path' flow variable, writes
# <workflow-data-path>/<File name>.tar.gz and publishes that path back to the
# workflow as flow variable 'v_tar_gz_file' for the downstream nodes.
import knime.scripting.io as knio
import tarfile
import os
import numpy as np
import pandas as pd

csv_file = knio.flow_variables['File path']
output_file = (knio.flow_variables['context.workflow.data-path']
               + knio.flow_variables['File name'] + ".tar.gz")
knio.flow_variables['v_tar_gz_file'] = output_file

with tarfile.open(output_file, 'w:gz') as tar:
    # arcname: store only the file name, not the absolute path the original
    # `tar.add(csv_file)` would have embedded in the archive.
    tar.add(csv_file, arcname=os.path.basename(csv_file))

# --- Script 2: extract one CSV from the .tar.gz directly into a DataFrame ---
# Streams the first *.csv member out of the archive without writing it to
# disk, parses it ('|'-separated) and hands it to KNIME as output table 0.
import knime.scripting.io as knio
import tarfile
import pandas as pd
from io import StringIO
import numpy as np

input_file = knio.flow_variables['v_tar_gz_file']
# csv_file = knio.flow_variables['File name'] + ".tar"

# Extract the CSV file from the tar gzip archive
with tarfile.open(input_file, 'r:gz') as tar:
    csv_file_in_archive = None
    for member in tar.getmembers():
        if member.name.endswith('.csv'):
            csv_file_in_archive = member.name
            break
    if csv_file_in_archive:
        csv_file_obj = tar.extractfile(csv_file_in_archive)
        csv_content = csv_file_obj.read().decode()
    else:
        raise FileNotFoundError("No CSV file found in the tar gzip archive.")

# Load the CSV content into a pandas DataFrame
csv_buffer = StringIO(csv_content)
df = pd.read_csv(csv_buffer, sep='|')
knio.output_tables[0] = knio.Table.from_pandas(df)

# --- Script 3: extract ALL CSV files from the .tar.gz to /data/extracted/ ---
# Unpacks every *.csv member into <workflow-data-path>/extracted/ and
# publishes that directory as flow variable 'v_output_directory' so a
# List Files/Folders node can pick the files up.
import knime.scripting.io as knio
import tarfile
import os
import numpy as np
import pandas as pd

input_file = knio.flow_variables['v_tar_gz_file']
output_directory = (knio.flow_variables['context.workflow.data-path']
                    + "extracted"
                    + knio.flow_variables['path.separator.system'])
knio.flow_variables['v_output_directory'] = output_directory

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

with tarfile.open(input_file, 'r:gz') as tar:
    for member in tar.getmembers():
        if member.name.endswith('.csv'):
            # Construct the output file path (basename only, so archive
            # sub-paths cannot escape the target directory)
            output_file_path = os.path.join(output_directory,
                                            os.path.basename(member.name))
            # Extract the CSV file to the output directory
            with open(output_file_path, 'wb') as f_out:
                f_out.write(tar.extractfile(member).read())

# --- Workflow annotations / node labels preserved from the export ------------
# locate and create /data/ folder with absolute paths
# large_file.csv -> v_large_csv_file => the path where the file is being stored
# v_large_*  with tarfile.open(output_file, 'w:gz') as tar: tar.add(csv_file)
#   => create .TAR.GZ file with bundled Python version
# extract all CSV Files from large_file.tar.gz to /data/extracted/
# extract one CSV file from .tar.gz file and extract as dataframe
# list all CSV files in "v_output_directory"
# Nodes: Collect Local Metadata | CSV Writer | Path to String (Variable) |
#   Path to URI | URL to File Path | Variable to Table Column |
#   Table Row to Variable | Python Script | Create CSV File |
#   List Files/Folders | String to Path (Variable)

Nodes

Extensions

Links