Skip to content

Commit

Permalink
add config to disable GZIP encoding in http requests
Browse files Browse the repository at this point in the history
  • Loading branch information
Johannes Bareuther committed Aug 28, 2024
1 parent 2d93f2f commit 71ac789
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 4 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ Configuration happens through environment variables only.
| `TES_NO_HTTP` | If `true` and `TES_EXPOSE_NATS` is also `true`, no HTTP server is started. |
| `TES_REMOVE_NEWLINES` | If true, extracted text will be compacted by replacing newlines with whitespace (Default: `true`). |
| `TES_FORK_THRESHOLD` | Maximum content length (size in bytes) of a file that is being converted in-process rather than by a subprocess in fork-exec style. Choose a negative value to disable forking. Default: 2 MiB |
| `TES_HTTP_CLIENT_DISABLE_COMPRESSION` | If `true`, outgoing HTTP requests are sent without the `Accept-Encoding: gzip` header, i.e. gzip-compressed responses are not requested (Default: `false`). |

## Usage

Expand Down
2 changes: 2 additions & 0 deletions config.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ type TesConfig struct {
// Maximum content length (size in bytes) of a file that is being converted in-process
// rather than by a subprocess in fork-exec style. Default: 2 MiB
ForkThreshold int64 `env:"TES_FORK_THRESHOLD" default:"2097152"`
// If true, do not send the "Accept-Encoding: gzip" header in outgoing HTTP requests. Default: false
HttpClientDisableCompression bool `env:"TES_HTTP_CLIENT_DISABLE_COMPRESSION" default:"false"`
// NATS max msg size (embedded server only)
NatsMaxPayload int32 `env:"TES_MAX_PAYLOAD" default:"8388608"`
// embedded NATS server storage location. Default: /tmp/nats
Expand Down
7 changes: 5 additions & 2 deletions extract.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ import (

type DocumentMetadata map[string]string

var validate *validator.Validate
var (
validate *validator.Validate
)

// ExtractedDocument contains pointers to metadata, textual content and URL of origin
type ExtractedDocument struct {
Expand Down Expand Up @@ -101,7 +103,8 @@ func DocFromUrl(params RequestParams, w io.Writer, header http.Header) (status i
}
}
logger.Debug("Issuing conditional GET request", "url", url, "headers", req.Header)
response, err := http.DefaultClient.Do(req)

response, err := httpClient.Do(req)
if err != nil {
logger.Error("Error fetching", "err", err, "url", url)
return http.StatusNotFound, err
Expand Down
8 changes: 6 additions & 2 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@ var (
cache Cache
cacheNop bool
closeDocChan chan Document
pdfImplementation string // Which lib is being used for PDFs?
logger *slog.Logger
pdfImplementation string // Which lib is being used for PDFs?
logger *slog.Logger
saveExtractedDocChan chan *ExtractedDocument
srv http.Server
tesConfig TesConfig
httpClient *http.Client
)

func main() {
Expand Down Expand Up @@ -80,6 +81,9 @@ func main() {
if !docparser.Initialized {
logger.Warn("wvWare is not available in PATH. We will not be able to extract legacy MS Word documents.")
}
httpClient = &http.Client{
Transport: &http.Transport{DisableCompression: tesConfig.HttpClientDisableCompression},
}
logger.Info("Service started", "address", srv.Addr)
defer logger.Info("HTTP Server stopped.")
if err := srv.ListenAndServe(); err != http.ErrServerClosed {
Expand Down

0 comments on commit 71ac789

Please sign in to comment.