Icon

kn_​forum_​62130_​excel_​large_​file_​python

import a large Excel file with 2,000 columns and 25,000 lines - 300 MB size

import a large Excel file with 2,000 columns and 25,000 lines - 300 MB size



import knime.scripting.io as knioimport pandas as pdv_excel_file = knio.flow_variables['file_path']v_name_sheet = knio.flow_variables['Name_Excel_sheet']df = pd.read_excel(v_excel_file, sheet_name=v_name_sheet, engine='openpyxl')knio.output_tables[0] = knio.Table.from_pandas(df) import knime.scripting.io as knioimport openpyxlfrom openpyxl.utils import get_column_letterfrom datetime import datetime# Create a new workbookworkbook = openpyxl.Workbook()# Select the active worksheetworksheet = workbook.active# Define the number of columns and rowsnum_columns = 2000num_rows = 25000# Define column headersheaders = [f"Column {get_column_letter(j)}" for j in range(1, num_columns+1)]# Add the headers to the worksheetworksheet.append(headers)# Freeze the top row with the headersworksheet.freeze_panes = 'A2'# Fill the worksheet with datafor i in range(1, num_rows+1): row_data = [] for j in range(1, num_columns+1): if j % 3 == 0: # Add a number to every third column row_data.append(j) elif j % 5 == 0: # Add a date to every fifth column row_data.append(datetime.now()) else: # Add a string to all other columns row_data.append(f"Row {i}, Column {get_column_letter(j)}") for j, value in enumerate(row_data, start=1): worksheet.cell(row=i+1, column=j, value=value)var_excel_path = knio.flow_variables['xlsx_path_location']# Save the workbookworkbook.save(var_excel_path) import a large Excel file with 2,000 columns and 25,000 lines - 300 MB sizehttps://forum.knime.com/t/increased-excel-file-size-and-excel-writer-sheet-append-issue-with-knime-4-6-0/62130/5?u=mlauber71 import knime.scripting.io as knioimport numpy as npimport pandas as pdimport openpyxlfrom openpyxl.utils.dataframe import dataframe_to_rowsdf= knio.input_tables[0].to_pandas()# Reset the index to a range of sequential integersdf = df.reset_index(drop=True)df.index += 1# Set the new index as a long integerdf.index = df.index.astype('int64')var_excel_file = knio.flow_variables['xlsx_path_location']var_excel_sheet = knio.flow_variables['var_excel_sheet_new']# https://stackoverflow.com/a/75162871# with the help of some ChatGPTdef df_to_excel(df, ws, header=True, index=True, startrow=0, startcol=0):"""Write DataFrame df to openpyxl worksheet ws""" values = df.values nrow, ncol = values.shape # Write the headers to the worksheet if header: for c_idx, value in enumerate(df.columns, startcol + 1): ws.cell(row=startrow + 1, column=c_idx).value = value # Write the data rows to the worksheet for r_idx, row in enumerate(values, startrow + 2): for c_idx, value in enumerate(row, startcol + 1): ws.cell(row=r_idx, column=c_idx).value = value# Load the workbook and sheetwb = openpyxl.load_workbook(var_excel_file)# Check if the sheet already exists, and remove it if it doesif var_excel_sheet in wb.sheetnames: std = wb[var_excel_sheet] wb.remove(std)ws = wb.create_sheet(var_excel_sheet)# Write the dataframe to the worksheetdf_to_excel(df, ws)# Save the workbookwb.save(var_excel_file)wb.close() locate and create/data/ folderwith absolute pathscreate a verylarge excel filexlsx_pathxlsx_path=> import Excel sheet with openpyxlName_Excel_sheetreturn "Sheet";Execute failed: ZIP entry size is too large or invalidAppend data to an existingExcel File (xlsx_path_location)into [var_excel_sheet_new](if exists overwrite)in this example the filewill have something like 600 MBvar_excel_sheet_new"Sheet2" Collect LocalMetadata Python Script Create File/FolderVariables Path to String(Variable) Python Script URL to FilePath (Variable) Java EditVariable (simple) Excel Reader Python Script Java EditVariable (simple) import knime.scripting.io as knioimport pandas as pdv_excel_file = knio.flow_variables['file_path']v_name_sheet = knio.flow_variables['Name_Excel_sheet']df = pd.read_excel(v_excel_file, sheet_name=v_name_sheet, engine='openpyxl')knio.output_tables[0] = knio.Table.from_pandas(df) import knime.scripting.io as knioimport openpyxlfrom openpyxl.utils import get_column_letterfrom datetime import datetime# Create a new workbookworkbook = openpyxl.Workbook()# Select the active worksheetworksheet = workbook.active# Define the number of columns and rowsnum_columns = 2000num_rows = 25000# Define column headersheaders = [f"Column {get_column_letter(j)}" for j in range(1, num_columns+1)]# Add the headers to the worksheetworksheet.append(headers)# Freeze the top row with the headersworksheet.freeze_panes = 'A2'# Fill the worksheet with datafor i in range(1, num_rows+1): row_data = [] for j in range(1, num_columns+1): if j % 3 == 0: # Add a number to every third column row_data.append(j) elif j % 5 == 0: # Add a date to every fifth column row_data.append(datetime.now()) else: # Add a string to all other columns row_data.append(f"Row {i}, Column {get_column_letter(j)}") for j, value in enumerate(row_data, start=1): worksheet.cell(row=i+1, column=j, value=value)var_excel_path = knio.flow_variables['xlsx_path_location']# Save the workbookworkbook.save(var_excel_path) import a large Excel file with 2,000 columns and 25,000 lines - 300 MB sizehttps://forum.knime.com/t/increased-excel-file-size-and-excel-writer-sheet-append-issue-with-knime-4-6-0/62130/5?u=mlauber71 import knime.scripting.io as knioimport numpy as npimport pandas as pdimport openpyxlfrom openpyxl.utils.dataframe import dataframe_to_rowsdf= knio.input_tables[0].to_pandas()# Reset the index to a range of sequential integersdf = df.reset_index(drop=True)df.index += 1# Set the new index as a long integerdf.index = df.index.astype('int64')var_excel_file = knio.flow_variables['xlsx_path_location']var_excel_sheet = knio.flow_variables['var_excel_sheet_new']# https://stackoverflow.com/a/75162871# with the help of some ChatGPTdef df_to_excel(df, ws, header=True, index=True, startrow=0, startcol=0):"""Write DataFrame df to openpyxl worksheet ws""" values = df.values nrow, ncol = values.shape # Write the headers to the worksheet if header: for c_idx, value in enumerate(df.columns, startcol + 1): ws.cell(row=startrow + 1, column=c_idx).value = value # Write the data rows to the worksheet for r_idx, row in enumerate(values, startrow + 2): for c_idx, value in enumerate(row, startcol + 1): ws.cell(row=r_idx, column=c_idx).value = value# Load the workbook and sheetwb = openpyxl.load_workbook(var_excel_file)# Check if the sheet already exists, and remove it if it doesif var_excel_sheet in wb.sheetnames: std = wb[var_excel_sheet] wb.remove(std)ws = wb.create_sheet(var_excel_sheet)# Write the dataframe to the worksheetdf_to_excel(df, ws)# Save the workbookwb.save(var_excel_file)wb.close() locate and create/data/ folderwith absolute pathscreate a verylarge excel filexlsx_pathxlsx_path=> import Excel sheet with openpyxlName_Excel_sheetreturn "Sheet";Execute failed: ZIP entry size is too large or invalidAppend data to an existingExcel File (xlsx_path_location)into [var_excel_sheet_new](if exists overwrite)in this example the filewill have something like 600 MBvar_excel_sheet_new"Sheet2"Collect LocalMetadata Python Script Create File/FolderVariables Path to String(Variable) Python Script URL to FilePath (Variable) Java EditVariable (simple) Excel Reader Python Script Java EditVariable (simple)

Nodes

Extensions

Links