From 8a6ce3d8a659759ce1ef1cd0083fafbe0f0dbeda Mon Sep 17 00:00:00 2001
From: Angela Simms <102690377+asimms41@users.noreply.github.com>
Date: Mon, 9 Sep 2024 16:01:27 +0100
Subject: [PATCH] 2540 Add newly released and updated connectors for RPCN Cloud
 launch (#39)

---
 antora.yml                                    |   9 +
 modules/ROOT/nav.adoc                         |   5 +
 .../pages/inputs/kafka_migrator.adoc          | 708 +++++++++++++++
 .../pages/inputs/kafka_migrator_bundle.adoc   |  53 ++
 .../pages/outputs/kafka_migrator.adoc         | 818 ++++++++++++++++++
 .../pages/outputs/kafka_migrator_bundle.adoc  |  41 +
 .../pages/outputs/kafka_migrator_offsets.adoc | 552 ++++++++++++
 .../pages/processors/ollama_chat.adoc         | 242 +++++-
 .../pages/processors/ollama_embeddings.adoc   | 143 ++-
 9 files changed, 2535 insertions(+), 36 deletions(-)
 create mode 100644 modules/components/pages/inputs/kafka_migrator.adoc
 create mode 100644 modules/components/pages/inputs/kafka_migrator_bundle.adoc
 create mode 100644 modules/components/pages/outputs/kafka_migrator.adoc
 create mode 100644 modules/components/pages/outputs/kafka_migrator_bundle.adoc
 create mode 100644 modules/components/pages/outputs/kafka_migrator_offsets.adoc

diff --git a/antora.yml b/antora.yml
index 43da6654..8a74c7f9 100644
--- a/antora.yml
+++ b/antora.yml
@@ -216,6 +216,9 @@ asciidoc:
   - "ollama_embeddings"
   - "aws_bedrock_chat"
   - "gcp_vertex_ai_chat"
+  - "kafka_migrator"
+  - "kafka_migrator_bundle"
+  - "kafka_migrator_offsets"
  # Defines the connectors that are marked 'certified'.
  # This list is a temporary solution until each connector's source code includes the correct support level information. https://github.com/redpanda-data/benthos/pull/21/commits/aba447436e44faf7d4279d3394d82084ea0ed1e1#diff-26839711b0903fcf81e360ea17690bd67eb5e924cbabb62ffe85202c98c2a0a6
  certified-components:
@@ -279,6 +282,12 @@ asciidoc:
    types: ["input", "output"]
  - name: "kafka_franz"
    types: ["input", "output"]
+ - name: "kafka_migrator"
+   types: ["input", "output"]
+ - name: "kafka_migrator_bundle"
+   types: ["input", "output"]
+ - name: "kafka_migrator_offsets"
+   types: ["output"]
  - name: "log"
    types: ["processor"]
  - name: "mapping"
diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc
index 77c07469..ff6fe7d8 100644
--- a/modules/ROOT/nav.adoc
+++ b/modules/ROOT/nav.adoc
@@ -48,6 +48,8 @@
 *** xref:components:inputs/inproc.adoc[]
 *** xref:components:inputs/kafka.adoc[]
 *** xref:components:inputs/kafka_franz.adoc[]
+*** xref:components:inputs/kafka_migrator.adoc[]
+*** xref:components:inputs/kafka_migrator_bundle.adoc[]
 *** xref:components:inputs/mongodb.adoc[]
 *** xref:components:inputs/mqtt.adoc[]
 *** xref:components:inputs/nanomsg.adoc[]
@@ -203,6 +205,9 @@
 *** xref:components:outputs/inproc.adoc[]
 *** xref:components:outputs/kafka.adoc[]
 *** xref:components:outputs/kafka_franz.adoc[]
+*** xref:components:outputs/kafka_migrator.adoc[]
+*** xref:components:outputs/kafka_migrator_bundle.adoc[]
+*** xref:components:outputs/kafka_migrator_offsets.adoc[]
 *** xref:components:outputs/mongodb.adoc[]
 *** xref:components:outputs/mqtt.adoc[]
 *** xref:components:outputs/nanomsg.adoc[]
diff --git a/modules/components/pages/inputs/kafka_migrator.adoc b/modules/components/pages/inputs/kafka_migrator.adoc
new file mode 100644
index 00000000..460f8ef8
--- /dev/null
+++ b/modules/components/pages/inputs/kafka_migrator.adoc
@@ -0,0 +1,708 @@
= kafka_migrator
// tag::single-source[]
:type: input
:status: beta
:categories: ["Services"]

// © 2024 Redpanda Data Inc.
component_type_dropdown::[]

Use this connector in conjunction with the xref:components:outputs/kafka_migrator.adoc[`kafka_migrator` output] to migrate topics between Kafka brokers. The `kafka_migrator` input uses the https://github.com/twmb/franz-go[Franz Kafka client library^].


ifndef::env-cloud[]
Introduced in version 4.35.0.
endif::[]

[tabs]
======
Common::
+
--

```yml
# Common configuration fields, showing default values
input:
  label: ""
  kafka_migrator:
    seed_brokers: [] # No default (required)
    topics: [] # No default (required)
    regexp_topics: false
    consumer_group: "" # No default (optional)
    auto_replay_nacks: true
```

--
Advanced::
+
--

```yml
# All configuration fields, showing default values
input:
  label: ""
  kafka_migrator:
    seed_brokers: [] # No default (required)
    topics: [] # No default (required)
    regexp_topics: false
    consumer_group: "" # No default (optional)
    client_id: benthos
    rack_id: ""
    batch_size: 1024
    auto_replay_nacks: true
    commit_period: 5s
    start_from_oldest: true
    tls:
      enabled: false
      skip_cert_verify: false
      enable_renegotiation: false
      root_cas: ""
      root_cas_file: ""
      client_certs: []
    sasl: [] # No default (optional)
    multi_header: false
    batching:
      count: 0
      byte_size: 0
      period: ""
      check: ""
      processors: [] # No default (optional)
    topic_lag_refresh_period: 5s
    output_resource: kafka_migrator_output
```

--
======

The `kafka_migrator` input:

* Reads a batch of messages from a Kafka broker.
* Attempts to create all selected topics along with their associated ACLs in the broker that the output points to, identified by the label specified in `output_resource`.
* Waits for the `kafka_migrator` output to acknowledge the writes before updating the Kafka consumer group offset.

Specify a consumer group for this input to consume one or more topics and automatically balance the topic partitions across any other connected clients with the same consumer group. Otherwise, topics are consumed in their entirety or with explicit partitions.
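For example, the following configuration is a minimal sketch of a migration pipeline that pairs this input with a `kafka_migrator` output. The `label` values match the defaults of this input's `output_resource` field and the output's `input_resource` field; the broker addresses, topics, and consumer group shown here are placeholders:

```yml
input:
  label: kafka_migrator_input
  kafka_migrator:
    seed_brokers: [ "127.0.0.1:9092" ]
    topics: [ "foo", "bar" ]
    consumer_group: "migrator"
    output_resource: kafka_migrator_output

output:
  label: kafka_migrator_output
  kafka_migrator:
    seed_brokers: [ "127.0.0.1:9093" ]
    topic: ${! metadata("kafka_topic") }
    key: ${! metadata("kafka_key") }
    input_resource: kafka_migrator_input
    max_in_flight: 1
```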
== Metrics

This input emits an `input_kafka_migrator_lag` metric with `topic` and `partition` labels for each consumed topic.

== Metadata

This input adds the following metadata fields to each message:

```text
- kafka_key
- kafka_topic
- kafka_partition
- kafka_offset
- kafka_lag
- kafka_timestamp_unix
- kafka_tombstone_message
- All record headers
```


== Fields

=== `seed_brokers`

A list of broker addresses to connect to in order. Use commas to separate multiple addresses in a single list item.

*Type*: `array`


```yml
# Examples

seed_brokers:
  - localhost:9092

seed_brokers:
  - foo:9092
  - bar:9092

seed_brokers:
  - foo:9092,bar:9092
```

=== `topics`

A list of topics to consume from. Use commas to separate multiple topics in a single element.

When a `consumer_group` is specified, partitions are automatically distributed across consumers of a topic. Otherwise, all partitions are consumed.

Alternatively, you can specify explicit partitions to consume by using a colon after the topic name. For example, `foo:0` would consume the partition `0` of the topic `foo`. This syntax supports ranges. For example, `foo:0-10` would consume partitions `0` through to `10` inclusive.

You can also specify an explicit offset to consume from by adding another colon after the partition. For example, `foo:0:10` would consume the partition `0` of the topic `foo` starting from the offset `10`. If the offset is not present (or remains unspecified), then the field `start_from_oldest` determines which offset to start from.


*Type*: `array`


```yml
# Examples

topics:
  - foo
  - bar

topics:
  - things.*

topics:
  - foo,bar

topics:
  - foo:0
  - bar:1
  - bar:3

topics:
  - foo:0,bar:1,bar:3

topics:
  - foo:0-5
```

=== `regexp_topics`

Whether listed topics are interpreted as regular expression patterns for matching multiple topics. When topics are specified with explicit partitions, this field must remain set to `false`.


*Type*: `bool`

*Default*: `false`

=== `consumer_group`

An optional consumer group. When specified, the partitions of specified topics are automatically distributed across consumers sharing a consumer group, and partition offsets are automatically committed and resumed under this name. Consumer groups are not supported when explicit partitions are specified to consume from in the `topics` field.


*Type*: `string`


=== `client_id`

An identifier for the client connection.


*Type*: `string`

*Default*: `"benthos"`

=== `rack_id`

A rack identifier for this client.


*Type*: `string`

*Default*: `""`

=== `batch_size`

The maximum number of messages that can accumulate in each batch.


*Type*: `int`

*Default*: `1024`

=== `auto_replay_nacks`

Whether to automatically replay messages that are rejected (nacked) at the output level. If the cause of rejections is persistent, leaving this option enabled can result in back pressure.

Set `auto_replay_nacks` to `false` to delete rejected messages. Disabling auto replays can greatly improve memory efficiency of high throughput streams as the original shape of the data is discarded immediately upon consumption and mutation.


*Type*: `bool`

*Default*: `true`

=== `commit_period`

The period of time between each commit of the current partition offsets. Offsets are always committed during shutdown.


*Type*: `string`

*Default*: `"5s"`

=== `start_from_oldest`

Determines whether to consume from the oldest available offset; otherwise, messages are consumed from the latest offset. This setting is applied when creating a new consumer group or the saved offset no longer exists.


*Type*: `bool`

*Default*: `true`

=== `tls`

Override system defaults with custom TLS settings.


*Type*: `object`


=== `tls.enabled`

Whether custom TLS settings are enabled.


*Type*: `bool`

*Default*: `false`

=== `tls.skip_cert_verify`

Whether to skip server-side certificate verification.


*Type*: `bool`

*Default*: `false`

=== `tls.enable_renegotiation`

Whether to allow the remote server to repeatedly request renegotiation. Enable this option if you're seeing the error message `local error: tls: no renegotiation`.


*Type*: `bool`

*Default*: `false`

ifndef::env-cloud[]
Requires version 3.45.0 or newer
endif::[]

=== `tls.root_cas`

Specify a certificate authority to use (optional). This is a string that represents a certificate chain from the parent trusted root certificate, through possible intermediate signing certificates, to the host certificate.

include::components:partial$secret_warning.adoc[]

*Type*: `string`

*Default*: `""`

```yml
# Examples

root_cas: |-
  -----BEGIN CERTIFICATE-----
  ...
+ -----END CERTIFICATE----- +``` + +=== `tls.root_cas_file` + +Specify the path to a root certificate authority file (optional). This is a file, often with a `.pem` extension, which contains a certificate chain from the parent trusted root certificate, through possible intermediate signing certificates, to the host certificate. + + +*Type*: `string` + +*Default*: `""` + +```yml +# Examples + +root_cas_file: ./root_cas.pem +``` + +=== `tls.client_certs` + +A list of client certificates to use. For each certificate specify values for either the `cert` and `key` fields, or `cert_file` and `key_file` fields. + + +*Type*: `array` + +*Default*: `[]` + +```yml +# Examples + +client_certs: + - cert: foo + key: bar + +client_certs: + - cert_file: ./example.pem + key_file: ./example.key +``` + +=== `tls.client_certs[].cert` + +A plain text certificate to use. + + +*Type*: `string` + +*Default*: `""` + +=== `tls.client_certs[].key` + +A plain text certificate key to use. + +include::components:partial$secret_warning.adoc[] + +*Type*: `string` + +*Default*: `""` + +=== `tls.client_certs[].cert_file` + +The path of a certificate to use. + + +*Type*: `string` + +*Default*: `""` + +=== `tls.client_certs[].key_file` + +The path of a certificate key to use. + + +*Type*: `string` + +*Default*: `""` + +=== `tls.client_certs[].password` + +A plain text password for when the private key is password encrypted in PKCS#1 or PKCS#8 format. The obsolete `pbeWithMD5AndDES-CBC` algorithm is not supported for the PKCS#8 format. + +Because the obsolete `pbeWithMD5AndDES-CBC` algorithm does not authenticate the ciphertext, it is vulnerable to padding Oracle attacks that can let an attacker recover the plaintext. + + +include::components:partial$secret_warning.adoc[] + + + +*Type*: `string` + +*Default*: `""` + +```yml +# Examples + +password: foo + +password: ${KEY_PASSWORD} +``` + +=== `sasl` + +Specify one or more methods of SASL authentication. SASL mechanisms are tried in order. If the broker supports the first mechanism, all connections use that mechanism. If the first mechanism fails, the client picks the first supported mechanism. Connections fail if the broker does not support any client mechanisms. + + +*Type*: `array` + + +```yml +# Examples + +sasl: + - mechanism: SCRAM-SHA-512 + password: bar + username: foo +``` + +=== `sasl[].mechanism` + +The SASL mechanism to use. + + +*Type*: `string` + + +|=== +| Option | Summary + +| `AWS_MSK_IAM` +| AWS IAM based authentication as specified by the `aws-msk-iam-auth` java library. +| `OAUTHBEARER` +| OAuth Bearer based authentication. +| `PLAIN` +| Plain text authentication. +| `SCRAM-SHA-256` +| SCRAM based authentication as specified in RFC5802. +| `SCRAM-SHA-512` +| SCRAM based authentication as specified in RFC5802. +| `none` +| Disable sasl authentication + +|=== + +=== `sasl[].username` + +A username to provide for PLAIN or SCRAM-* authentication. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].password` + +A password to provide for PLAIN or SCRAM-* authentication. + +include::components:partial$secret_warning.adoc[] + + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].token` + +The token to use for a single session's OAUTHBEARER authentication. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].extensions` + +Key/value pairs to add to OAUTHBEARER authentication requests. + + +*Type*: `object` + + +=== `sasl[].aws` + +Contains AWS specific fields for when the `mechanism` is set to `AWS_MSK_IAM`. 
+ + +*Type*: `object` + + +=== `sasl[].aws.region` + +The AWS region to target. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.endpoint` + +Specify a custom endpoint for the AWS API. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.credentials` + +Manually configure the AWS credentials to use (optional). For more information, see xref:guides:cloud/aws.adoc[]. + + +*Type*: `object` + + +=== `sasl[].aws.credentials.profile` + +A profile from `~/.aws/credentials` to use. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.credentials.id` + +The ID of credentials to use. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.credentials.secret` + +The secret for the credentials being used. + +include::components:partial$secret_warning.adoc[] + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.credentials.token` + +The token for the credentials being used. This is required when using short term credentials. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.credentials.from_ec2_role` + +Use the credentials of a host EC2 machine configured to assume https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2.html[an IAM role associated with the instance^]. + + +*Type*: `bool` + +*Default*: `false` + +ifndef::env-cloud[] +Requires version 4.2.0 or newer +endif::[] + +=== `sasl[].aws.credentials.role` + +A role ARN to assume. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.credentials.role_external_id` + +An external ID to provide when assuming a role. + + +*Type*: `string` + +*Default*: `""` + +=== `multi_header` + +Decode headers into lists to allow handling of multiple values with the same key. + + +*Type*: `bool` + +*Default*: `false` + +=== `batching` + +Configure a xref:configuration:batching.adoc[batching policy] that applies to individual topic partitions in order to batch messages together before flushing them for processing. Batching can be beneficial for performance as well as useful for windowed processing, and doing so this way preserves the ordering of topic partitions. + + +*Type*: `object` + + +```yml +# Examples + +batching: + byte_size: 5000 + count: 0 + period: 1s + +batching: + count: 10 + period: 1s + +batching: + check: this.contains("END BATCH") + count: 0 + period: 1m +``` + +=== `batching.count` + +The number of messages after which the batch is flushed. Set to `0` to disable count-based batching. + + +*Type*: `int` + +*Default*: `0` + +=== `batching.byte_size` + +The amount of bytes at which the batch is flushed. Set to `0` to disable size-based batching. + + +*Type*: `int` + +*Default*: `0` + +=== `batching.period` + +The period after which an incomplete batch is flushed regardless of its size. + + +*Type*: `string` + +*Default*: `""` + +```yml +# Examples + +period: 1s + +period: 1m + +period: 500ms +``` + +=== `batching.check` + +A xref:guides:bloblang/about.adoc[Bloblang query] that should return a boolean value indicating whether a message should end a batch. + + +*Type*: `string` + +*Default*: `""` + +```yml +# Examples + +check: this.type == "end_of_transaction" +``` + +=== `batching.processors` + +A list of xref:components:processors/about.adoc[processors] to apply to a batch as it is flushed. This allows you to aggregate and archive the batch. All resulting messages are flushed as a single batch. Splitting the batch into smaller batches using these processors is a no-op. 
*Type*: `array`


```yml
# Examples

processors:
  - archive:
      format: concatenate

processors:
  - archive:
      format: lines

processors:
  - archive:
      format: json_array
```

=== `topic_lag_refresh_period`

The period of time between each topic lag refresh cycle.


*Type*: `string`

*Default*: `"5s"`

=== `output_resource`

The label of the `kafka_migrator` output in which the currently selected topics need to be created before attempting to read messages.


*Type*: `string`

*Default*: `"kafka_migrator_output"`

// end::single-source[]
\ No newline at end of file
diff --git a/modules/components/pages/inputs/kafka_migrator_bundle.adoc b/modules/components/pages/inputs/kafka_migrator_bundle.adoc
new file mode 100644
index 00000000..fa5a6150
--- /dev/null
+++ b/modules/components/pages/inputs/kafka_migrator_bundle.adoc
@@ -0,0 +1,53 @@
= kafka_migrator_bundle
// tag::single-source[]
:type: input
:status: experimental
:categories: ["Services"]

// © 2024 Redpanda Data Inc.


component_type_dropdown::[]


The `kafka_migrator_bundle` input reads messages and schemas from a Kafka or Redpanda cluster. Use this input in conjunction with the xref:components:outputs/kafka_migrator_bundle.adoc[`kafka_migrator_bundle` output].

```yml
# Config fields, showing default values
input:
  label: ""
  kafka_migrator_bundle:
    kafka_migrator: {} # No default (required)
    schema_registry: {} # No default (required)
    migrate_schemas_before_data: true
```


== Fields

=== `kafka_migrator`

The xref:components:inputs/kafka_migrator.adoc[`kafka_migrator` input] configuration.


*Type*: `object`


=== `schema_registry`

The xref:components:inputs/schema_registry.adoc[`schema_registry` input] configuration.


*Type*: `object`


=== `migrate_schemas_before_data`

Migrate all schemas first before starting to migrate data.


*Type*: `bool`

*Default*: `true`
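== Example

The following is a minimal sketch of a bundle configuration. The nested `kafka_migrator` and `schema_registry` blocks accept the same fields as the standalone inputs linked above; the broker address, Schema Registry URL, topics, and consumer group shown here are placeholders:

```yml
input:
  kafka_migrator_bundle:
    kafka_migrator:
      seed_brokers: [ "127.0.0.1:9092" ]
      topics: [ "foo" ]
      consumer_group: "migrator_bundle"
    schema_registry:
      url: http://127.0.0.1:8081
    migrate_schemas_before_data: true
```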
// end::single-source[]
\ No newline at end of file
diff --git a/modules/components/pages/outputs/kafka_migrator.adoc b/modules/components/pages/outputs/kafka_migrator.adoc
new file mode 100644
index 00000000..783667a2
--- /dev/null
+++ b/modules/components/pages/outputs/kafka_migrator.adoc
@@ -0,0 +1,818 @@
= kafka_migrator
// tag::single-source[]
:type: output
:status: beta
:categories: ["Services"]

// © 2024 Redpanda Data Inc.


component_type_dropdown::[]

Writes a batch of messages to a Kafka broker and waits for acknowledgement before propagating them back to the input.

Use this connector in conjunction with the xref:components:inputs/kafka_migrator.adoc[`kafka_migrator` input] to migrate topics between Kafka brokers. The `kafka_migrator` output uses the https://github.com/twmb/franz-go[Franz Kafka client library^].

ifndef::env-cloud[]
Introduced in version 4.35.0.
endif::[]

[tabs]
======
Common::
+
--

```yml
# Common config fields, showing default values
output:
  label: ""
  kafka_migrator:
    seed_brokers: [] # No default (required)
    topic: "" # No default (required)
    key: "" # No default (optional)
    partition: ${! meta("partition") } # No default (optional)
    metadata:
      include_prefixes: []
      include_patterns: []
    max_in_flight: 10
    batching:
      count: 0
      byte_size: 0
      period: ""
      check: ""
```

--
Advanced::
+
--

```yml
# All config fields, showing default values
output:
  label: ""
  kafka_migrator:
    seed_brokers: [] # No default (required)
    topic: "" # No default (required)
    key: "" # No default (optional)
    partitioner: "" # No default (optional)
    partition: ${! meta("partition") } # No default (optional)
    client_id: benthos
    rack_id: ""
    idempotent_write: true
    metadata:
      include_prefixes: []
      include_patterns: []
    max_in_flight: 10
    timeout: 10s
    batching:
      count: 0
      byte_size: 0
      period: ""
      check: ""
      processors: [] # No default (optional)
    max_message_bytes: 1MB
    broker_write_max_bytes: 100MB
    compression: "" # No default (optional)
    tls:
      enabled: false
      skip_cert_verify: false
      enable_renegotiation: false
      root_cas: ""
      root_cas_file: ""
      client_certs: []
    sasl: [] # No default (optional)
    timestamp: ${! timestamp_unix() } # No default (optional)
    input_resource: kafka_migrator_input
```

--
======

This output can query the `kafka_migrator` input for topic and ACL configurations.

If the configured broker does not contain the current message topic, this output attempts to create it along with the topic
ACLs, which are automatically read from the `kafka_migrator` input, identified by the label specified in
`input_resource`.


== Examples

[tabs]
======
Transfer data::
+
--

Writes messages to the configured broker and creates topics and topic ACLs if they don't exist. It also ensures that the message order is preserved.

```yaml
output:
  kafka_migrator:
    seed_brokers: [ "127.0.0.1:9093" ]
    topic: ${! metadata("kafka_topic").or(throw("missing kafka_topic metadata")) }
    key: ${! metadata("kafka_key") }
    partitioner: manual
    partition: ${! metadata("kafka_partition").or(throw("missing kafka_partition metadata")) }
    timestamp: ${! metadata("kafka_timestamp_unix").or(timestamp_unix()) }
    input_resource: kafka_migrator_input
    max_in_flight: 1
```

--
======

== Fields

=== `seed_brokers`

A list of broker addresses to connect to. Use commas to separate multiple addresses in a single list item.


*Type*: `array`


```yml
# Examples

seed_brokers:
  - localhost:9092

seed_brokers:
  - foo:9092
  - bar:9092

seed_brokers:
  - foo:9092,bar:9092
```

=== `topic`

A topic to write messages to.

This field supports xref:configuration:interpolation.adoc#bloblang-queries[interpolation functions].


*Type*: `string`


=== `key`

An optional key to populate for each message.

This field supports xref:configuration:interpolation.adoc#bloblang-queries[interpolation functions].


*Type*: `string`


=== `partitioner`

Override the default murmur2 hashing partitioner.


*Type*: `string`


|===
| Option | Summary

| `least_backup`
| Chooses the least backed up partition (the partition with the fewest buffered records). Partitions are selected per batch.
| `manual`
| Manually select a partition for each message. You must also specify a value for the `partition` field.
| `murmur2_hash`
| Kafka's default hash algorithm that uses a 32-bit murmur2 hash of the key to compute the partition for the record.
| `round_robin`
| Round-robins messages through all available partitions.
This algorithm has lower throughput and causes higher CPU load on brokers, but is useful if you want to ensure an even distribution of records to partitions. + +|=== + +=== `partition` + +An optional explicit partition to set for each message. This field is only relevant when the `partitioner` is set to `manual`. The provided interpolation string must be a valid integer. + +This field supports xref:configuration:interpolation.adoc#bloblang-queries[interpolation functions]. + + +*Type*: `string` + + +```yml +# Examples + +partition: ${! meta("partition") } +``` + +=== `client_id` + +An identifier for the client connection. + + +*Type*: `string` + +*Default*: `"benthos"` + +=== `rack_id` + +A rack identifier for this client. + + +*Type*: `string` + +*Default*: `""` + +=== `idempotent_write` + +Enable the idempotent write producer option. This requires the `IDEMPOTENT_WRITE` permission on `CLUSTER`. Disable this option if the `IDEMPOTENT_WRITE` permission is not available. + + +*Type*: `bool` + +*Default*: `true` + +=== `metadata` + +Determine which (if any) metadata values are added to messages as headers. + + +*Type*: `object` + + +=== `metadata.include_prefixes` + +Provide a list of explicit metadata key prefixes to match against. + + +*Type*: `array` + +*Default*: `[]` + +```yml +# Examples + +include_prefixes: + - foo_ + - bar_ + +include_prefixes: + - kafka_ + +include_prefixes: + - content- +``` + +=== `metadata.include_patterns` + +Provide a list of explicit metadata key regular expression (re2) patterns to match against. + + +*Type*: `array` + +*Default*: `[]` + +```yml +# Examples + +include_patterns: + - .* + +include_patterns: + - _timestamp_unix$ +``` + +=== `max_in_flight` + +The maximum number of batches to send in parallel at any given time. + + +*Type*: `int` + +*Default*: `10` + +=== `timeout` + +The maximum period of time to wait for message sends before abandoning the request and retrying + + +*Type*: `string` + +*Default*: `"10s"` + +=== `batching` + +Configure a xref:configuration:batching.adoc[batching policy]. + + +*Type*: `object` + + +```yml +# Examples + +batching: + byte_size: 5000 + count: 0 + period: 1s + +batching: + count: 10 + period: 1s + +batching: + check: this.contains("END BATCH") + count: 0 + period: 1m +``` + +=== `batching.count` + +The number of messages after which the batch is flushed. Set to `0` to disable count-based batching. + +*Type*: `int` + +*Default*: `0` + +=== `batching.byte_size` + +The amount of bytes at which the batch is flushed. Set to `0` to disable size-based batching. + + +*Type*: `int` + +*Default*: `0` + +=== `batching.period` + +The period after which an incomplete batch is flushed regardless of its size. + +*Type*: `string` + +*Default*: `""` + +```yml +# Examples + +period: 1s + +period: 1m + +period: 500ms +``` + +=== `batching.check` + +A xref:guides:bloblang/about.adoc[Bloblang query] that should return a boolean value indicating whether a message should end a batch. + + +*Type*: `string` + +*Default*: `""` + +```yml +# Examples + +check: this.type == "end_of_transaction" +``` + +=== `batching.processors` + +A list of xref:components:processors/about.adoc[processors] to apply to a batch as it is flushed. This allows you to aggregate and archive the batch. All resulting messages are flushed as a single batch. Splitting the batch into smaller batches using these processors is a no-op. 
+ +*Type*: `array` + + +```yml +# Examples + +processors: + - archive: + format: concatenate + +processors: + - archive: + format: lines + +processors: + - archive: + format: json_array +``` + +=== `max_message_bytes` + +The maximum space in bytes that an individual message may use. Messages larger than this value are rejected. This field corresponds to Kafka's `max.message.bytes`. + + +*Type*: `string` + +*Default*: `"1MB"` + +```yml +# Examples + +max_message_bytes: 100MB + +max_message_bytes: 50mib +``` + +=== `broker_write_max_bytes` + +The upper bound for the number of bytes written to a broker connection in a single write. This field corresponds to Kafka's `socket.request.max.bytes`. + + +*Type*: `string` + +*Default*: `"100MB"` + +```yml +# Examples + +broker_write_max_bytes: 128MB + +broker_write_max_bytes: 50mib +``` + +=== `compression` + +Set an explicit compression type (optional). The default preference is to use `snappy` when the broker supports it. Otherwise, use `none`. + + +*Type*: `string` + + +Options: +`lz4` +, `snappy` +, `gzip` +, `none` +, `zstd` +. + +=== `tls` + +Override system defaults with custom TLS settings. + + +*Type*: `object` + + +=== `tls.enabled` + +Whether custom TLS settings are enabled. + + +*Type*: `bool` + +*Default*: `false` + +=== `tls.skip_cert_verify` + +Whether to skip server-side certificate verification. + + +*Type*: `bool` + +*Default*: `false` + +=== `tls.enable_renegotiation` + +Whether to allow the remote server to repeatedly request renegotiation. Enable this option if you're seeing the error message `local error: tls: no renegotiation`. + + +*Type*: `bool` + +*Default*: `false` + +ifndef::env-cloud[] +Requires version 3.45.0 or newer +endif::[] + +=== `tls.root_cas` + +Specify a certificate authority to use (optional). This is a string that represents a certificate chain from the parent trusted root certificate, through possible intermediate signing certificates, to the host certificate. + +include::components:partial$secret_warning.adoc[] + + + +*Type*: `string` + +*Default*: `""` + +```yml +# Examples + +root_cas: |- + -----BEGIN CERTIFICATE----- + ... + -----END CERTIFICATE----- +``` + +=== `tls.root_cas_file` + +Specify the path to a root certificate authority file (optional). This is a file, often with a `.pem` extension, which contains a certificate chain from the parent trusted root certificate, through possible intermediate signing certificates, to the host certificate. + + +*Type*: `string` + +*Default*: `""` + +```yml +# Examples + +root_cas_file: ./root_cas.pem +``` + +=== `tls.client_certs` + +A list of client certificates to use. For each certificate specify values for either the `cert` and `key` fields, or `cert_file` and `key_file` fields. + + + +*Type*: `array` + +*Default*: `[]` + +```yml +# Examples + +client_certs: + - cert: foo + key: bar + +client_certs: + - cert_file: ./example.pem + key_file: ./example.key +``` + +=== `tls.client_certs[].cert` + +A plain text certificate to use. + + +*Type*: `string` + +*Default*: `""` + +=== `tls.client_certs[].key` + +A plain text certificate key to use. + +include::components:partial$secret_warning.adoc[] + + +*Type*: `string` + +*Default*: `""` + +=== `tls.client_certs[].cert_file` + +The path of a certificate to use. + + +*Type*: `string` + +*Default*: `""` + +=== `tls.client_certs[].key_file` + +The path of a certificate key to use. 
+ + +*Type*: `string` + +*Default*: `""` + +=== `tls.client_certs[].password` + +A plain text password for when the private key is password encrypted in PKCS#1 or PKCS#8 format. The obsolete `pbeWithMD5AndDES-CBC` algorithm is not supported for the PKCS#8 format. + +Because the obsolete `pbeWithMD5AndDES-CBC` algorithm does not authenticate the ciphertext, it is vulnerable to padding Oracle attacks that can let an attacker recover the plaintext. + +include::components:partial$secret_warning.adoc[] + + +*Type*: `string` + +*Default*: `""` + +```yml +# Examples + +password: foo + +password: ${KEY_PASSWORD} +``` + +=== `sasl` + +Specify one or more methods of SASL authentication. SASL mechanisms are tried in order. If the broker supports the first mechanism, all connections will use that mechanism. If the first mechanism fails, the client picks the first supported mechanism. Connections fail if the broker does not support any client mechanisms. + + +*Type*: `array` + + +```yml +# Examples + +sasl: + - mechanism: SCRAM-SHA-512 + password: bar + username: foo +``` + +=== `sasl[].mechanism` + +The SASL mechanism to use. + + +*Type*: `string` + + +|=== +| Option | Summary + +| `AWS_MSK_IAM` +| AWS IAM based authentication as specified by the `aws-msk-iam-auth` java library. +| `OAUTHBEARER` +| OAuth Bearer based authentication. +| `PLAIN` +| Plain text authentication. +| `SCRAM-SHA-256` +| SCRAM-based authentication as specified in RFC5802. +| `SCRAM-SHA-512` +| SCRAM-based authentication as specified in RFC5802. +| `none` +| Disable sasl authentication + +|=== + +=== `sasl[].username` + +A username to provide for PLAIN or SCRAM-* authentication. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].password` + +A password to provide for PLAIN or SCRAM-* authentication. + +include::components:partial$secret_warning.adoc[] + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].token` + +The token to use for a single session's OAUTHBEARER authentication. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].extensions` + +Key/value pairs to add to OAUTHBEARER authentication requests. + + +*Type*: `object` + + +=== `sasl[].aws` + +Contains AWS specific fields for when the `mechanism` is set to `AWS_MSK_IAM`. + + +*Type*: `object` + + +=== `sasl[].aws.region` + +The AWS region to target. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.endpoint` + +Specify a custom endpoint for the AWS API. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.credentials` + +Manually configure the AWS credentials to use (optional). For more information, see xref:guides:cloud/aws.adoc[]. + + +*Type*: `object` + + +=== `sasl[].aws.credentials.profile` + +A profile from `~/.aws/credentials` to use. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.credentials.id` + +The ID of credentials to use. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.credentials.secret` + +The secret for the credentials being used. + +include::components:partial$secret_warning.adoc[] + + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.credentials.token` + +The token for the credentials being used. The token is required when using short term credentials. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.credentials.from_ec2_role` + +Use the credentials of a host EC2 machine configured to assume https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2.html[an IAM role associated with the instance^]. 
*Type*: `bool`

*Default*: `false`

ifndef::env-cloud[]
Requires version 4.2.0 or newer
endif::[]

=== `sasl[].aws.credentials.role`

A role ARN to assume.


*Type*: `string`

*Default*: `""`

=== `sasl[].aws.credentials.role_external_id`

An external ID to provide when assuming a role.


*Type*: `string`

*Default*: `""`

=== `timestamp`

An optional timestamp to set for each message. When left empty, the current timestamp is used.
This field supports xref:configuration:interpolation.adoc#bloblang-queries[interpolation functions].


*Type*: `string`


```yml
# Examples

timestamp: ${! timestamp_unix() }

timestamp: ${! metadata("kafka_timestamp_unix") }
```

=== `input_resource`

The label of the `kafka_migrator` input from which to read the topic and ACL configurations to create.


*Type*: `string`

*Default*: `"kafka_migrator_input"`

// end::single-source[]
\ No newline at end of file
diff --git a/modules/components/pages/outputs/kafka_migrator_bundle.adoc b/modules/components/pages/outputs/kafka_migrator_bundle.adoc
new file mode 100644
index 00000000..7bc94801
--- /dev/null
+++ b/modules/components/pages/outputs/kafka_migrator_bundle.adoc
@@ -0,0 +1,41 @@
= kafka_migrator_bundle
// tag::single-source[]
:type: output
:status: experimental
:categories: ["Services"]


// © 2024 Redpanda Data Inc.


component_type_dropdown::[]

Writes messages and schemas to a Kafka or Redpanda cluster. Use this output in conjunction with the xref:components:inputs/kafka_migrator_bundle.adoc[`kafka_migrator_bundle` input].


```yml
# Config fields, showing default values
output:
  label: ""
  kafka_migrator_bundle:
    kafka_migrator: {} # No default (required)
    schema_registry: {} # No default (required)
```

== Fields

=== `kafka_migrator`

The xref:components:outputs/kafka_migrator.adoc[`kafka_migrator` output] configuration.

*Type*: `object`


=== `schema_registry`

The xref:components:outputs/schema_registry.adoc[`schema_registry` output] configuration. The `subject` field must be left empty.


*Type*: `object`

// end::single-source[]
\ No newline at end of file
diff --git a/modules/components/pages/outputs/kafka_migrator_offsets.adoc b/modules/components/pages/outputs/kafka_migrator_offsets.adoc
new file mode 100644
index 00000000..5d9c6f2f
--- /dev/null
+++ b/modules/components/pages/outputs/kafka_migrator_offsets.adoc
@@ -0,0 +1,552 @@
= kafka_migrator_offsets
// tag::single-source[]
:type: output
:status: beta
:categories: ["Services"]


// © 2024 Redpanda Data Inc.


component_type_dropdown::[]


Use the `kafka_migrator_offsets` output in conjunction with a `kafka_franz` input that is configured to read the `__consumer_offsets` topic.
This output uses the https://github.com/twmb/franz-go[Franz Kafka client library^].


ifndef::env-cloud[]
Introduced in version 4.35.0.
endif::[]

[tabs]
======
Common::
+
--

```yml
# Common config fields, showing default values
output:
  label: ""
  kafka_migrator_offsets:
    seed_brokers: [] # No default (required)
    kafka_key: ${! @kafka_key }
    max_in_flight: 1
```

--
Advanced::
+
--

```yml
# All config fields, showing default values
output:
  label: ""
  kafka_migrator_offsets:
    seed_brokers: [] # No default (required)
    kafka_key: ${! @kafka_key }
    client_id: benthos
    max_in_flight: 1
    timeout: 10s
    max_message_bytes: 1MB
    broker_write_max_bytes: 100MB
    tls:
      enabled: false
      skip_cert_verify: false
      enable_renegotiation: false
      root_cas: ""
      root_cas_file: ""
      client_certs: []
    sasl: [] # No default (optional)
    max_retries: 0
    backoff:
      initial_interval: 1s
      max_interval: 5s
      max_elapsed_time: 30s
```

--
======
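== Example

The following is a minimal sketch of a pipeline that migrates consumer group offsets by pairing a `kafka_franz` input, reading the `__consumer_offsets` topic, with this output. The broker addresses are placeholders:

```yml
input:
  kafka_franz:
    seed_brokers: [ "127.0.0.1:9092" ]
    topics: [ "__consumer_offsets" ]

output:
  kafka_migrator_offsets:
    seed_brokers: [ "127.0.0.1:9093" ]
    kafka_key: ${! @kafka_key }
    max_in_flight: 1
```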
== Fields

=== `seed_brokers`

A list of broker addresses to connect to. Use commas to separate multiple addresses in a single list item.


*Type*: `array`


```yml
# Examples

seed_brokers:
  - localhost:9092

seed_brokers:
  - foo:9092
  - bar:9092

seed_brokers:
  - foo:9092,bar:9092
```

=== `kafka_key`

Kafka key.

This field supports xref:configuration:interpolation.adoc#bloblang-queries[interpolation functions].


*Type*: `string`

*Default*: `"${! @kafka_key }"`

=== `client_id`

An identifier for the client connection.


*Type*: `string`

*Default*: `"benthos"`

=== `max_in_flight`

The maximum number of batches to send in parallel at any given time.


*Type*: `int`

*Default*: `1`

=== `timeout`

The maximum period of time to wait for message sends before abandoning the request and retrying.


*Type*: `string`

*Default*: `"10s"`

=== `max_message_bytes`

The maximum space in bytes that an individual message may use. Messages larger than this value are rejected. This field corresponds to Kafka's `max.message.bytes`.


*Type*: `string`

*Default*: `"1MB"`

```yml
# Examples

max_message_bytes: 100MB

max_message_bytes: 50mib
```

=== `broker_write_max_bytes`

The upper bound for the number of bytes written to a broker connection in a single write. This field corresponds to Kafka's `socket.request.max.bytes`.


*Type*: `string`

*Default*: `"100MB"`

```yml
# Examples

broker_write_max_bytes: 128MB

broker_write_max_bytes: 50mib
```

=== `tls`

Override system defaults with custom TLS settings.


*Type*: `object`


=== `tls.enabled`

Whether custom TLS settings are enabled.


*Type*: `bool`

*Default*: `false`

=== `tls.skip_cert_verify`

Whether to skip server-side certificate verification.


*Type*: `bool`

*Default*: `false`

=== `tls.enable_renegotiation`

Whether to allow the remote server to repeatedly request renegotiation. Enable this option if you're seeing the error message `local error: tls: no renegotiation`.


*Type*: `bool`

*Default*: `false`

ifndef::env-cloud[]
Requires version 3.45.0 or newer
endif::[]

=== `tls.root_cas`

Specify a certificate authority to use (optional). This is a string that represents a certificate chain from the parent trusted root certificate, through possible intermediate signing certificates, to the host certificate.

include::components:partial$secret_warning.adoc[]

*Type*: `string`

*Default*: `""`

```yml
# Examples

root_cas: |-
  -----BEGIN CERTIFICATE-----
  ...
  -----END CERTIFICATE-----
```

=== `tls.root_cas_file`

Specify the path to a root certificate authority file (optional). This is a file, often with a `.pem` extension, which contains a certificate chain from the parent trusted root certificate, through possible intermediate signing certificates, to the host certificate.


*Type*: `string`

*Default*: `""`

```yml
# Examples

root_cas_file: ./root_cas.pem
```

=== `tls.client_certs`

A list of client certificates to use.
For each certificate, specify values for either the `cert` and `key`, or `cert_file` and `key_file` fields. + + +*Type*: `array` + +*Default*: `[]` + +```yml +# Examples + +client_certs: + - cert: foo + key: bar + +client_certs: + - cert_file: ./example.pem + key_file: ./example.key +``` + +=== `tls.client_certs[].cert` + +A plain text certificate to use. + + +*Type*: `string` + +*Default*: `""` + +=== `tls.client_certs[].key` + +A plain text certificate key to use. + +include::components:partial$secret_warning.adoc[] + + +*Type*: `string` + +*Default*: `""` + +=== `tls.client_certs[].cert_file` + +The path of a certificate to use. + + +*Type*: `string` + +*Default*: `""` + +=== `tls.client_certs[].key_file` + +The path of a certificate key to use. + + +*Type*: `string` + +*Default*: `""` + +=== `tls.client_certs[].password` + +A plain text password for when the private key is password encrypted in PKCS#1 or PKCS#8 format. The obsolete `pbeWithMD5AndDES-CBC` algorithm is not supported for the PKCS#8 format. + +Because the obsolete `pbeWithMD5AndDES-CBC` algorithm does not authenticate the ciphertext, it is vulnerable to padding oracle attacks that can let an attacker recover the plaintext. + +include::components:partial$secret_warning.adoc[] + + + +*Type*: `string` + +*Default*: `""` + +```yml +# Examples + +password: foo + +password: ${KEY_PASSWORD} +``` + +=== `sasl` + +Specify one or more methods of SASL authentication. SASL mechanisms are tried in order. If the broker supports the first mechanism, all connections use that mechanism. If the first mechanism fails, the client picks the first supported mechanism. Connections fail if the broker does not support any client mechanisms. + + +*Type*: `array` + + +```yml +# Examples + +sasl: + - mechanism: SCRAM-SHA-512 + password: bar + username: foo +``` + +=== `sasl[].mechanism` + +The SASL mechanism to use. + + +*Type*: `string` + + +|=== +| Option | Summary + +| `AWS_MSK_IAM` +| AWS IAM based authentication as specified by the `aws-msk-iam-auth` java library. +| `OAUTHBEARER` +| OAuth Bearer based authentication. +| `PLAIN` +| Plain text authentication. +| `SCRAM-SHA-256` +| SCRAM based authentication as specified in RFC5802. +| `SCRAM-SHA-512` +| SCRAM based authentication as specified in RFC5802. +| `none` +| Disable sasl authentication + +|=== + +=== `sasl[].username` + +A username to provide for PLAIN or SCRAM-* authentication. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].password` + +A password to provide for PLAIN or SCRAM-* authentication. + +include::components:partial$secret_warning.adoc[] + + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].token` + +The token to use for a single session's OAUTHBEARER authentication. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].extensions` + +Key/value pairs to add to OAUTHBEARER authentication requests. + + +*Type*: `object` + + +=== `sasl[].aws` + +Contains AWS specific fields for when the `mechanism` is set to `AWS_MSK_IAM`. + + +*Type*: `object` + + +=== `sasl[].aws.region` + +The AWS region to target. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.endpoint` + +Specify a custom endpoint for the AWS API. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.credentials` + +Manual configuration of AWS credentials to use (optional). For more information, see xref:guides:cloud/aws.adoc[]. + + +*Type*: `object` + + +=== `sasl[].aws.credentials.profile` + +A profile from `~/.aws/credentials` to use. 
+ + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.credentials.id` + +The ID of credentials to use. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.credentials.secret` + +The secret for the credentials being used. + +include::components:partial$secret_warning.adoc[] + + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.credentials.token` + +The token for the credentials being used, required when using short-term credentials. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.credentials.from_ec2_role` + +Use the credentials of a host EC2 machine configured to assume https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2.html[an IAM role associated with the instance^]. + + +*Type*: `bool` + +*Default*: `false` + +ifndef::env-cloud[] +Requires version 4.2.0 or newer +endif::[] + +=== `sasl[].aws.credentials.role` + +A role ARN to assume. + + +*Type*: `string` + +*Default*: `""` + +=== `sasl[].aws.credentials.role_external_id` + +An external ID to provide when assuming a role. + + +*Type*: `string` + +*Default*: `""` + +=== `max_retries` + +The maximum number of retries before giving up on the request. If set to `0`, there is no discrete limit. + + +*Type*: `int` + +*Default*: `0` + +=== `backoff` + +Control the time intervals between retry attempts. + + +*Type*: `object` + + +=== `backoff.initial_interval` + +The initial period to wait between retry attempts. + + +*Type*: `string` + +*Default*: `"1s"` + +=== `backoff.max_interval` + +The maximum period to wait between retry attempts. + + +*Type*: `string` + +*Default*: `"5s"` + +=== `backoff.max_elapsed_time` + +The maximum period to wait before retry attempts are abandoned. If set to `0`, there is no waiting period. + + +*Type*: `string` + +*Default*: `"30s"` + +// end::single-source[] \ No newline at end of file diff --git a/modules/components/pages/processors/ollama_chat.adoc b/modules/components/pages/processors/ollama_chat.adoc index 5abb2022..885a9108 100644 --- a/modules/components/pages/processors/ollama_chat.adoc +++ b/modules/components/pages/processors/ollama_chat.adoc @@ -28,9 +28,15 @@ Common:: # Common config fields, showing default values label: "" ollama_chat: - server_address: http://127.0.0.1:11434 # No default (optional) - model: llama3 # No default (required) + model: llama3.1 # No default (required) prompt: "" # No default (optional) + response_format: text + max_tokens: 0 # No default (optional) + temperature: 0 # No default (optional) + runner: + context_size: 0 # No default (optional) + batch_size: 0 # No default (optional) + server_address: http://127.0.0.1:11434 # No default (optional) ``` -- @@ -42,10 +48,30 @@ Advanced:: # All config fields, showing default values label: "" ollama_chat: - server_address: http://127.0.0.1:11434 # No default (optional) - model: llama3 # No default (required) + model: llama3.1 # No default (required) prompt: "" # No default (optional) system_prompt: "" # No default (optional) + response_format: text + max_tokens: 0 # No default (optional) + temperature: 0 # No default (optional) + num_keep: 0 # No default (optional) + seed: 42 # No default (optional) + top_k: 0 # No default (optional) + top_p: 0 # No default (optional) + repeat_penalty: 0 # No default (optional) + presence_penalty: 0 # No default (optional) + frequency_penalty: 0 # No default (optional) + stop: [] # No default (optional) + runner: + context_size: 0 # No default (optional) + batch_size: 0 # No default (optional) + gpu_layers: 0 # No default (optional) + 
threads: 0 # No default (optional)
+      use_mmap: false # No default (optional)
+      use_mlock: false # No default (optional)
+    server_address: http://127.0.0.1:11434 # No default (optional)
+    cache_directory: /opt/cache/connect/ollama # No default (optional)
+    download_url: "" # No default (optional)
```

--
======
@@ -59,20 +85,6 @@ For more information, see the https://github.com/ollama/ollama/tree/main/docs[Ol

== Fields

-=== `server_address`
-
-The address of the Ollama server to use. Leave the field blank and the processor starts and runs a local Ollama server or specify the address of your own local or remote server.
-
-
-*Type*: `string`
-
-
-```yml
-# Examples
-
-server_address: http://127.0.0.1:11434
-```
-
=== `model`

The name of the Ollama LLM to use. For a full list of models, see the https://ollama.com/models[Ollama website].
@@ -84,7 +96,7 @@ The name of the Ollama LLM to use. For a full list of models, see the https://ol

```yml
# Examples

-model: llama3
+model: llama3.1

model: gemma2
@@ -110,4 +122,194 @@ This field supports xref:configuration:interpolation.adoc#bloblang-queries[inter

*Type*: `string`

-// end::single-source[] \ No newline at end of file

=== `response_format`

The format of the response that the Ollama model generates. If you request JSON output, your `prompt` should also instruct the model to return JSON.


*Type*: `string`

*Default*: `"text"`

Options:
`text`
, `json`
.

=== `max_tokens`

The maximum number of tokens to predict and output. Limiting the amount of output means that requests are processed faster and have a fixed limit on the cost.


*Type*: `int`


=== `temperature`

The temperature of the model. Increasing the temperature makes the model answer more creatively.


*Type*: `int`


=== `num_keep`

Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to `4`. Use `-1` to retain all tokens from the initial prompt.


*Type*: `int`


=== `seed`

Sets the random number seed to use for generation. Setting this to a specific number makes the model generate the same text for the same prompt.


*Type*: `int`


```yml
# Examples

seed: 42
```

=== `top_k`

Reduces the probability of generating nonsense. A higher value, for example `100`, gives more diverse answers. A lower value, for example `10`, is more conservative.


*Type*: `int`


=== `top_p`

Works together with `top_k`. A higher value, for example `0.95`, leads to more diverse text. A lower value, for example `0.5`, generates more focused and conservative text.


*Type*: `float`


=== `repeat_penalty`

Sets how strongly to penalize repetitions. A higher value, for example `1.5`, penalizes repetitions more strongly. A lower value, for example `0.9`, is more lenient.


*Type*: `float`


=== `presence_penalty`

Positive values penalize new tokens if they have appeared in the text so far. This increases the model's likelihood to talk about new topics.


*Type*: `float`


=== `frequency_penalty`

Positive values penalize new tokens based on the frequency of their appearance in the text so far. This decreases the model's likelihood to repeat the same line verbatim.


*Type*: `float`


=== `stop`

Sets the stop sequences to use. When one of these patterns is encountered, the LLM stops generating text and returns the final response.
*Type*: `array`


=== `runner`

Options for the model runner that are used when the model is first loaded into memory.


*Type*: `object`


=== `runner.context_size`

Sets the size of the context window used to generate the next token. Using a larger context window uses more memory and takes longer to process.


*Type*: `int`


=== `runner.batch_size`

The maximum number of requests to process in parallel.


*Type*: `int`


=== `runner.gpu_layers`

This option allows offloading some layers to the GPU for computation. This generally results in increased performance. By default, the runtime decides the number of layers dynamically.


*Type*: `int`


=== `runner.threads`

Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has. By default, the runtime decides the optimal number of threads.


*Type*: `int`


=== `runner.use_mmap`

Map the model into memory. This option is only supported on Unix systems, and allows loading only the necessary parts of the model as needed.


*Type*: `bool`


=== `runner.use_mlock`

Lock the model in memory, preventing it from being swapped out when memory-mapped. This option can improve performance but reduces some of the advantages of memory-mapping because it uses more RAM to run and can slow down load times as the model loads into RAM.


*Type*: `bool`


=== `server_address`

The address of the Ollama server to use. Leave the field blank to have the processor start and run a local Ollama server, or specify the address of your own local or remote server.


*Type*: `string`


```yml
# Examples

server_address: http://127.0.0.1:11434
```

=== `cache_directory`

If `server_address` is not set, the directory to download the Ollama binary to and use as a model cache.


*Type*: `string`


```yml
# Examples

cache_directory: /opt/cache/connect/ollama
```

=== `download_url`

If `server_address` is not set, the URL from which to download the Ollama binary. Defaults to the official Ollama GitHub release for this platform.


*Type*: `string`
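== Example

The following is a minimal sketch of a pipeline that uses this processor to summarize each message. The model name and prompt are illustrative:

```yml
pipeline:
  processors:
    - ollama_chat:
        model: llama3.1
        prompt: "Summarize the following text: ${! content() }"
        response_format: text
```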
// end::single-source[]
diff --git a/modules/components/pages/processors/ollama_embeddings.adoc b/modules/components/pages/processors/ollama_embeddings.adoc
index 48587d6d..b1b8cd26 100644
--- a/modules/components/pages/processors/ollama_embeddings.adoc
+++ b/modules/components/pages/processors/ollama_embeddings.adoc
@@ -17,15 +17,50 @@ ifndef::env-cloud[]
 Introduced in version 4.32.0.
 endif::[]

+[tabs]
+======
+Common::
++
+--
+
 ```yml
-# Config fields, showing default values
+# Common config fields, showing default values
 label: ""
 ollama_embeddings:
+  model: nomic-embed-text # No default (required)
+  text: "" # No default (optional)
+  runner:
+    context_size: 0 # No default (optional)
+    batch_size: 0 # No default (optional)
   server_address: http://127.0.0.1:11434 # No default (optional)
+```
+
+--
+Advanced::
++
+--
+
+```yml
+# All config fields, showing default values
+label: ""
+ollama_embeddings:
   model: nomic-embed-text # No default (required)
   text: "" # No default (optional)
+  runner:
+    context_size: 0 # No default (optional)
+    batch_size: 0 # No default (optional)
+    gpu_layers: 0 # No default (optional)
+    threads: 0 # No default (optional)
+    use_mmap: false # No default (optional)
+    use_mlock: false # No default (optional)
+  server_address: http://127.0.0.1:11434 # No default (optional)
+  cache_directory: /opt/cache/connect/ollama # No default (optional)
+  download_url: "" # No default (optional)
 ```

+--
+======
+
 This processor sends text to your chosen Ollama large language model (LLM) and creates vector embeddings, using the Ollama API. Vector embeddings are long arrays of numbers that represent values or objects, in this case text. By default, the processor starts and runs a locally installed Ollama server. Alternatively, to use an already running Ollama server, add your server details to the `server_address` field. You can https://ollama.com/download[download and install Ollama from the Ollama website^].
@@ -34,20 +69,6 @@ For more information, see the https://github.com/ollama/ollama/tree/main/docs[Ol

== Fields

-=== `server_address`
-
-The address of the Ollama server to use. Leave the field blank and the processor starts and runs a local Ollama server or specify the address of your own local or remote server.
-
-
-*Type*: `string`
-
-
-```yml
-# Examples
-
-server_address: http://127.0.0.1:11434
-```
-
=== `model`

The name of the Ollama LLM to use. For a full list of models, see the https://ollama.com/models[Ollama website].
@@ -76,4 +97,94 @@ This field supports xref:configuration:interpolation.adoc#bloblang-queries[inter

*Type*: `string`

-// end::single-source[] \ No newline at end of file

=== `runner`

Options for the model runner that are used when the model is first loaded into memory.


*Type*: `object`


=== `runner.context_size`

Sets the size of the context window used to generate the next token. Using a larger context window uses more memory and takes longer to process.


*Type*: `int`


=== `runner.batch_size`

The maximum number of requests to process in parallel.


*Type*: `int`


=== `runner.gpu_layers`

This option allows offloading some layers to the GPU for computation. This generally results in increased performance. By default, the runtime decides the number of layers dynamically.


*Type*: `int`


=== `runner.threads`

Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has. By default, the runtime decides the optimal number of threads.


*Type*: `int`


=== `runner.use_mmap`

Map the model into memory. This option is only supported on Unix systems, and allows loading only the necessary parts of the model as needed.


*Type*: `bool`


=== `runner.use_mlock`

Lock the model in memory, preventing it from being swapped out when memory-mapped.
This option can improve performance but reduces some of the advantages of memory-mapping because it uses more RAM to run and can slow down load times as the model loads into RAM.


*Type*: `bool`


=== `server_address`

The address of the Ollama server to use. Leave the field blank to have the processor start and run a local Ollama server, or specify the address of your own local or remote server.


*Type*: `string`


```yml
# Examples

server_address: http://127.0.0.1:11434
```

=== `cache_directory`

If `server_address` is not set, the directory to download the Ollama binary to and use as a model cache.


*Type*: `string`


```yml
# Examples

cache_directory: /opt/cache/connect/ollama
```

=== `download_url`

If `server_address` is not set, the URL from which to download the Ollama binary. Defaults to the official Ollama GitHub release for this platform.


*Type*: `string`

// end::single-source[]
\ No newline at end of file