
kn_example_r_read_parquet_file

Use the R library(arrow) to read a Parquet file into KNIME

Export the data to SQLite, ARFF and again to another Parquet file. Also: split a larger table into multiple Parquet files within a folder and handle them with R.
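As a minimal sketch of the core idea (run outside KNIME, with a hypothetical file path instead of the KNIME flow variable used in the full scripts below), reading a Parquet file into a plain R data frame only takes a few lines:

library(arrow)

# hypothetical path -- inside the workflow this value comes from the
# KNIME flow variable knime.flow.in[["var_path_parquet_file"]]
var_path_parquet_file <- "/tmp/test_file.parquet"

# read the Parquet file and convert it to a plain R data frame
df <- as.data.frame(read_parquet(var_path_parquet_file))
str(df)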

You can also write large tables to a folder containing several Parquet files, import them into R and later export them again as one large single Parquet file:
https://forum.knime.com/t/knime-assign-failed-request-status-data-overflow-incoming-data-too-big/65232/2?u=mlauber71

R script of the first R Source (Table) node: read the Parquet file, export the data to SQLite, ARFF and again to Parquet, and return the paths as flow variables.

library(arrow)
library(RSQLite)
library(foreign)

df <- as.data.frame(read_parquet(knime.flow.in[["var_path_parquet_file"]]))

# ----- location of SQLite database
# define some basic variables
var_loc_sqlite <- paste0(knime.flow.in[["context.workflow.data-path"]], "db.sqlite")

con <- dbConnect(drv = RSQLite::SQLite(), dbname = var_loc_sqlite)
dbWriteTable(con, "test_file", df, overwrite = TRUE)

# export as ARFF file
var_loc_arff <- paste0(knime.flow.in[["context.workflow.data-path"]], "test_file.arff")
write.arff(df, var_loc_arff, eol = "\n")

# export file (again) as Parquet
var_loc_parquet <- paste0(knime.flow.in[["context.workflow.data-path"]], "test_file_from_r.parquet")
write_parquet(x = df, sink = var_loc_parquet, compression = "gzip", compression_level = 5)

# write flow variables back to the KNIME workflow
knime.flow.out <- list(
  var_loc_parquet = as.character(var_loc_parquet)
)

knime.out <- df

dbDisconnect(con)

R script of the second R Source (Table) node: read all Parquet files from a subfolder into one data.table and export it again as a single large Parquet file.

library(arrow)
library(data.table)

## provide a path to the folder with the collection of .parquet files
var_path_parquet_folder <- paste0(knime.flow.in[["context.workflow.data-path"]], "test_folder_parquet")

## https://stackoverflow.com/questions/58439966/read-partitioned-parquet-directory-all-files-in-one-r-dataframe-with-apache-ar
## define the dataset
DS <- arrow::open_dataset(sources = var_path_parquet_folder)
## create a scanner
SO <- Scanner$create(DS)
## load it as an Arrow Table in memory
AT <- SO$ToTable()
## convert it to an R data frame
df <- as.data.table(AT)

# export file (again) as Parquet
var_loc_parquet_large <- paste0(knime.flow.in[["context.workflow.data-path"]], "test_file_from_r_folder.parquet")
write_parquet(x = df, sink = var_loc_parquet_large, compression = "gzip", compression_level = 5)

# write flow variables back to the KNIME workflow
knime.flow.out <- list(
  var_loc_parquet_folder = as.character(var_path_parquet_folder),
  var_loc_parquet_large = as.character(var_loc_parquet_large)
)

knime.out <- df

Workflow overview: a "Collect Local Metadata" component locates and creates the /data/ folder with absolute paths, and a Data Generator node creates dummy data that is written to /data/test_file.parquet. The first R Source (Table) node reads this Parquet file and exports the data to KNIME, to SQLite (knime://knime.workflow/data/db.sqlite, table default.test_file), to ARFF (knime://knime.workflow/data/test_file.arff) and again to Parquet (knime://knime.workflow/data/test_file_from_r.parquet); the paths come back from R as var_loc_* flow variables and are converted with String to Path (Variable). In a second branch another Data Generator creates a larger dummy data set (with a Cluster_Membership column) that the Parquet Writer splits into several Parquet files inside /data/test_folder_parquet/ (the folder will be overwritten; in a real-world scenario you should set the split values to a larger default, e.g. 1024 MB and 128 MB; helper flow variables such as var_write_mode_parquet are provided via Java Edit Variable (simple) and Merge Variables). The second R Source (Table) node imports all Parquet files from that subfolder and exports them to a single large Parquet file, /data/test_file_from_r_folder.parquet, which is then read back with a Parquet Reader.
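Side note: instead of the explicit Scanner used in the second script, newer versions of arrow also let you collect a whole folder of Parquet files through the dplyr interface. This is only a hedged alternative sketch with a hypothetical folder path, not part of the workflow itself:

library(arrow)
library(dplyr)

# hypothetical folder path -- in the workflow this is built from
# knime.flow.in[["context.workflow.data-path"]] and "test_folder_parquet"
var_path_parquet_folder <- "/tmp/test_folder_parquet"

# open the folder as one Arrow dataset and pull it into an R data frame
df <- arrow::open_dataset(sources = var_path_parquet_folder) %>%
  collect() %>%
  as.data.frame()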
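The splitting of the large table into several Parquet files is done here with the KNIME Parquet Writer, but a similar split could also be done from R with arrow::write_dataset(). The following is just a sketch under the assumption that the data still contains the Cluster_Membership column from the Data Generator; the paths are hypothetical:

library(arrow)

# hypothetical paths -- in the workflow they would be derived from
# knime.flow.in[["context.workflow.data-path"]]
var_loc_parquet_large  <- "/tmp/test_file_from_r_folder.parquet"
var_path_parquet_split <- "/tmp/test_folder_parquet_from_r"

df <- read_parquet(var_loc_parquet_large)

# write one group of Parquet files per value of Cluster_Membership
write_dataset(df, path = var_path_parquet_split,
              format = "parquet",
              partitioning = "Cluster_Membership")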
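To check the other exports of the first script, the SQLite table and the ARFF file can also be read back into R. Again a hedged sketch with hypothetical paths (inside the workflow this check is done with the SQLite Connector / DB Reader and the ARFF Reader nodes):

library(RSQLite)
library(foreign)

# hypothetical paths -- in the workflow both files live in the /data/ folder
var_loc_sqlite <- "/tmp/db.sqlite"
var_loc_arff   <- "/tmp/test_file.arff"

# read the exported table back from the SQLite database
con <- dbConnect(RSQLite::SQLite(), dbname = var_loc_sqlite)
df_sqlite <- dbReadTable(con, "test_file")
dbDisconnect(con)

# read the exported ARFF file back
df_arff <- read.arff(var_loc_arff)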

Nodes

ARFF Reader
Collect Local Metadata
Column Rename (2x)
Data Generator (2x)
DB Reader
DB Table Selector
determine paths (metanode)
Java Edit Variable (simple)
Merge Variables
Parquet Reader (3x)
Parquet Writer (2x)
R Source (Table) (2x)
SQLite Connector
String to Path (Variable) (2x)


Links

https://forum.knime.com/t/knime-assign-failed-request-status-data-overflow-incoming-data-too-big/65232/2?u=mlauber71
https://stackoverflow.com/questions/58439966/read-partitioned-parquet-directory-all-files-in-one-r-dataframe-with-apache-ar