diff --git a/data/operation_dulce/dataset.zip b/data/operation_dulce/dataset.zip
index d8a5123b..0e9d37a5 100644
Binary files a/data/operation_dulce/dataset.zip and b/data/operation_dulce/dataset.zip differ
diff --git a/posts/config/custom/index.html b/posts/config/custom/index.html
index 3a0d82ae..d46c08a7 100644
--- a/posts/config/custom/index.html
+++ b/posts/config/custom/index.html
@@ -389,7 +389,8 @@ a {
> input
-type: The input type field discriminates between the different input types. Options are csv and text.
+type: The type of input to use. Options are file or blob.
+file_type: The file type field discriminates between the different file types. Options are csv and text.
base_dir: The base directory to read the input files from. This is relative to the config file.
file_pattern: A regex to match the input files. The regex must have named groups for each of the fields in the file_filter.
post_process: A DataShaper workflow definition to apply to the input before executing the primary workflow.
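Putting those fields together, a minimal sketch of the new input block under the updated schema (the paths and pattern are illustrative, not taken from the docs):

    input:
      type: file                # or blob
      file_type: csv            # or text
      base_dir: ../data/csv     # resolved relative to the config file
      file_pattern: '.*\.csv$'  # named groups in the regex can feed file_filter
      # post_process: a DataShaper workflow definition could be attached here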
@@ -400,8 +401,9 @@ a {
-input:
-  type: csv
+input:
+  type: file
+  file_type: csv
 base_dir: ../data/csv # the directory containing the CSV files, this is relative to the config file
 file_pattern: '.*[\/](?P<source>[^\/]+)[\/](?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})_(?P<author>[^_]+)_\d+\.csv$' # a regex to match the CSV files
 # An additional file filter which uses the named groups from the file_pattern to further filter the files
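The named groups in that regex (source, year, month, day, author) are what the file filter can match against; a hedged sketch, since the exact file_filter syntax is not shown on this page and the value is hypothetical:

    file_filter:
      source: 'dulce'  # hypothetical: keep only files whose <source> group matches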
@@ -420,14 +422,15 @@ a {
column:"title",value:"My document"
-
-input:
-  type: text
+input:
+  type: file
+  file_type: text
 base_dir: ../data/csv # the directory containing the CSV files, this is relative to the config file
 file_pattern: '.*[\/](?P<source>[^\/]+)[\/](?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})_(?P<author>[^_]+)_\d+\.csv$' # a regex to match the CSV files
 # An additional file filter which uses the named groups from the file_pattern to further filter the files
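The plaintext variant keeps the same shape; only the file type and pattern differ. A minimal sketch (directory and pattern are illustrative, not from the docs):

    input:
      type: file
      file_type: text
      base_dir: ../data/txt
      file_pattern: '.*\.txt$'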
@@ -442,7 +445,7 @@ a {
column:"title",value:"My document"
-
+
diff --git a/posts/config/env_vars/index.html b/posts/config/env_vars/index.html
index e571d4b0..6d900a17 100644
--- a/posts/config/env_vars/index.html
+++ b/posts/config/env_vars/index.html
@@ -648,7 +648,7 @@ a {
Input Settings
These settings control the data input used by the pipeline. Any settings with a fallback will use the base LLM settings, if available.
-Plaintext Input Data (GRAPHRAG_INPUT_TYPE=text)
+Plaintext Input Data (GRAPHRAG_INPUT_FILE_TYPE=text)
@@ -669,7 +669,7 @@ a {
-CSV Input Data (GRAPHRAG_INPUT_TYPE=csv)
+CSV Input Data (GRAPHRAG_INPUT_FILE_TYPE=csv)
@@ -682,6 +682,13 @@ a {
+GRAPHRAG_INPUT_TYPE
+The input storage type to use when reading files. (file or blob)
+str
+optional
+file
+
+
GRAPHRAG_INPUT_FILE_PATTERN
The file pattern regexp to use when reading input files from the input directory.
str
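The renaming means storage location and file format are now configured independently; a sketch of the resulting environment settings (values illustrative):

    GRAPHRAG_INPUT_TYPE=file              # where files live: file or blob
    GRAPHRAG_INPUT_FILE_TYPE=csv          # how they are parsed: csv or text
    GRAPHRAG_INPUT_FILE_PATTERN=.*\.csv$  # regex applied to input file names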
@@ -731,13 +738,6 @@ a {
title
-GRAPHRAG_INPUT_STORAGE_TYPE
-The storage type to use when reading CSV input files. (file or blob)
-str
-optional
-file
-
-
GRAPHRAG_INPUT_STORAGE_ACCOUNT_BLOB_URL
The Azure Storage blob endpoint to use when in blob mode and using managed identity. Will have the format https://<storage_account_name>.blob.core.windows.net
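For blob-backed input the same pair of variables applies, plus the endpoint; a sketch assuming managed identity, with the account-name placeholder kept from the docs:

    GRAPHRAG_INPUT_TYPE=blob
    GRAPHRAG_INPUT_FILE_TYPE=csv
    GRAPHRAG_INPUT_STORAGE_ACCOUNT_BLOB_URL=https://<storage_account_name>.blob.core.windows.net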