From d16310ead891dc5d63ab5a1ab78ceacaed8bb98d Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sat, 13 May 2023 11:22:01 -0400 Subject: [PATCH] New `mlr json-parse -k` flag (#1291) * New `mlr json-parse -k` flag * docs / make-dev artifacts * Add unit-test files for the feature * codespell --- docs/src/data-diving-examples.md | 46 +++++++++---------- docs/src/manpage.md | 4 +- docs/src/manpage.txt | 4 +- docs/src/reference-verbs.md | 40 ++++++++-------- docs/src/two-pass-algorithms.md | 4 +- internal/pkg/mlrval/mlrmap_json.go | 8 ++++ internal/pkg/mlrval/mlrval_json.go | 10 ++++ internal/pkg/transformers/json_parse.go | 22 ++++++++- man/manpage.txt | 4 +- man/mlr.1 | 6 ++- test/cases/cli-help/0001/expout | 2 + .../verb-json-parse-json-stringify/0027-k/cmd | 1 + .../0027-k/experr | 0 .../0027-k/expout | 19 ++++++++ .../verb-json-parse-json-stringify/0028-k/cmd | 1 + .../0028-k/experr | 0 .../0028-k/expout | 19 ++++++++ test/input/json-parse-with-error.csv | 4 ++ 18 files changed, 143 insertions(+), 51 deletions(-) create mode 100644 test/cases/verb-json-parse-json-stringify/0027-k/cmd create mode 100644 test/cases/verb-json-parse-json-stringify/0027-k/experr create mode 100644 test/cases/verb-json-parse-json-stringify/0027-k/expout create mode 100644 test/cases/verb-json-parse-json-stringify/0028-k/cmd create mode 100644 test/cases/verb-json-parse-json-stringify/0028-k/experr create mode 100644 test/cases/verb-json-parse-json-stringify/0028-k/expout create mode 100644 test/input/json-parse-with-error.csv diff --git a/docs/src/data-diving-examples.md b/docs/src/data-diving-examples.md index 39738f193..100716ec2 100644 --- a/docs/src/data-diving-examples.md +++ b/docs/src/data-diving-examples.md @@ -160,11 +160,11 @@ CITRUS COUNTY 1332.9 79974.9 483785.1 stats2 -a corr,linreg-ols,r2 -f tiv_2011,tiv_2012
-tiv_2011_tiv_2012_corr  0.9730497632351692
-tiv_2011_tiv_2012_ols_m 0.9835583980337723
-tiv_2011_tiv_2012_ols_b 433854.6428968317
+tiv_2011_tiv_2012_corr  0.9730497632351701
+tiv_2011_tiv_2012_ols_m 0.9835583980337732
+tiv_2011_tiv_2012_ols_b 433854.6428968301
 tiv_2011_tiv_2012_ols_n 36634
-tiv_2011_tiv_2012_r2    0.9468258417320189
+tiv_2011_tiv_2012_r2    0.9468258417320204
 
@@ -322,7 +322,7 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
 
           u_v_corr              w_x_corr
-0.1334180491027861 -0.011319841199866178
+0.1334180491027861 -0.011319841199852926
 
@@ -332,22 +332,22 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
 
  color    shape              u_v_corr               w_x_corr
-   red   circle    0.9807984401887236   -0.01856553658708754
-orange   square   0.17685855992752927   -0.07104431573806054
- green   circle   0.05764419437577255    0.01179572988801509
-   red   square   0.05574477124893523 -0.0006801456507510942
-yellow triangle   0.04457273771962798   0.024604310103081825
-yellow   square   0.04379172927296089   -0.04462197201631237
-purple   circle   0.03587354936895086     0.1341133954140899
-  blue   square   0.03241153095761164  -0.053507648119643196
-  blue triangle  0.015356427073158766 -0.0006089997461435399
-orange   circle  0.010518953877704048   -0.16279397329279383
-   red triangle   0.00809782571528034   0.012486621357942596
-purple triangle  0.005155190909099334  -0.045057909256220656
-purple   square -0.025680276963377404    0.05769429647930396
- green   square   -0.0257760734502851  -0.003265173252087127
-orange triangle -0.030456661186085785    -0.1318699981926352
-yellow   circle  -0.06477331572781474    0.07369449819706045
-  blue   circle  -0.10234761901929677  -0.030528539069837757
- green triangle  -0.10901825107358765   -0.04848782060162929
+   red   circle    0.9807984401887242  -0.018565536587084836
+orange   square   0.17685855992752933   -0.07104431573805543
+ green   circle   0.05764419437577257   0.011795729888018455
+   red   square    0.0557447712489348 -0.0006801456507506415
+yellow triangle    0.0445727377196281   0.024604310103079844
+yellow   square    0.0437917292729612  -0.044621972016306265
+purple   circle   0.03587354936895115    0.13411339541407613
+  blue   square   0.03241153095761152   -0.05350764811965621
+  blue triangle  0.015356427073158612 -0.0006089997461408209
+orange   circle  0.010518953877704181    -0.1627939732927932
+   red triangle   0.00809782571528054    0.01248662135795501
+purple triangle  0.005155190909099739   -0.04505790925621933
+purple   square  -0.02568027696337717   0.057694296479293694
+ green   square -0.025776073450284875 -0.0032651732520739014
+orange triangle -0.030456661186085584   -0.13186999819263814
+yellow   circle  -0.06477331572781515     0.0736944981970553
+  blue   circle   -0.1023476190192966  -0.030528539069839333
+ green triangle  -0.10901825107358747   -0.04848782060162855
 
diff --git a/docs/src/manpage.md b/docs/src/manpage.md index 2a6845a8c..33325ce9d 100644 --- a/docs/src/manpage.md +++ b/docs/src/manpage.md @@ -1269,6 +1269,8 @@ MILLER(1) MILLER(1) Tries to convert string field values to parsed JSON, e.g. "[1,2,3]" -> [1,2,3]. Options: -f {...} Comma-separated list of field names to json-parse (default all). + -k If supplied, then on parse fail for any cell, keep the (unparsable) + input value for the cell. -h|--help Show this message. 1mjson-stringify0m @@ -3357,5 +3359,5 @@ MILLER(1) MILLER(1) - 2023-04-20 MILLER(1) + 2023-05-13 MILLER(1) diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt index 77149c34b..37d960e3d 100644 --- a/docs/src/manpage.txt +++ b/docs/src/manpage.txt @@ -1248,6 +1248,8 @@ MILLER(1) MILLER(1) Tries to convert string field values to parsed JSON, e.g. "[1,2,3]" -> [1,2,3]. Options: -f {...} Comma-separated list of field names to json-parse (default all). + -k If supplied, then on parse fail for any cell, keep the (unparsable) + input value for the cell. -h|--help Show this message. 1mjson-stringify0m @@ -3336,4 +3338,4 @@ MILLER(1) MILLER(1) - 2023-04-20 MILLER(1) + 2023-05-13 MILLER(1) diff --git a/docs/src/reference-verbs.md b/docs/src/reference-verbs.md index db6c1a644..b0f1d43f5 100644 --- a/docs/src/reference-verbs.md +++ b/docs/src/reference-verbs.md @@ -1792,6 +1792,8 @@ Usage: mlr json-parse [options] Tries to convert string field values to parsed JSON, e.g. "[1,2,3]" -> [1,2,3]. Options: -f {...} Comma-separated list of field names to json-parse (default all). +-k If supplied, then on parse fail for any cell, keep the (unparsable) + input value for the cell. -h|--help Show this message. @@ -3275,14 +3277,14 @@ fields, optionally categorized by one or more fields. data/medium
-x_y_cov    0.000042574820827444476
-x_y_corr   0.0005042001844467462
-y_y_cov    0.08461122467974003
+x_y_cov    0.00004257482082749404
+x_y_corr   0.0005042001844473328
+y_y_cov    0.08461122467974005
 y_y_corr   1
-x2_xy_cov  0.04188382281779374
-x2_xy_corr 0.630174342037994
-x2_y2_cov  -0.00030953725962542085
-x2_y2_corr -0.0034249088761121966
+x2_xy_cov  0.041883822817793716
+x2_xy_corr 0.6301743420379936
+x2_y2_cov  -0.0003095372596253918
+x2_y2_corr -0.003424908876111875
 
@@ -3291,12 +3293,12 @@ x2_y2_corr -0.0034249088761121966
   data/medium
 
-a   x_y_ols_m             x_y_ols_b           x_y_ols_n x_y_r2                  y_y_ols_m y_y_ols_b y_y_ols_n y_y_r2 xy_y2_ols_m        xy_y2_ols_b         xy_y2_ols_n xy_y2_r2
-pan 0.01702551273681908   0.5004028922897639  2081      0.00028691820445814767  1         0         2081      1      0.8781320866715662 0.11908230147563566 2081        0.41749827377311266
-eks 0.0407804923685586    0.48140207967651016 1965      0.0016461239223448587   1         0         1965      1      0.8978728611690183 0.10734054433612333 1965        0.45563223864254526
-wye -0.03915349075204814  0.5255096523974456  1966      0.0015051268704373607   1         0         1966      1      0.8538317334220835 0.1267454301662969  1966        0.38991721818599295
-zee 0.0027812364960399147 0.5043070448033061  2047      0.000007751652858786137 1         0         2047      1      0.8524439912011013 0.12401684308018937 2047        0.39356598090006495
-hat -0.018620577041095078 0.5179005397264935  1941      0.0003520036646055585   1         0         1941      1      0.8412305086345014 0.13557328318623216 1941        0.3687944261732265
+a   x_y_ols_m             x_y_ols_b          x_y_ols_n x_y_r2                  y_y_ols_m y_y_ols_b                           y_y_ols_n y_y_r2 xy_y2_ols_m        xy_y2_ols_b         xy_y2_ols_n xy_y2_r2
+pan 0.017025512736819345  0.500402892289764  2081      0.00028691820445815624  1         -0.00000000000000002890430283104539 2081      1      0.8781320866715664 0.11908230147563569 2081        0.4174982737731127
+eks 0.04078049236855813   0.4814020796765104 1965      0.0016461239223448218   1         0.00000000000000017862676354313703  1965      1      0.897872861169018  0.1073405443361234  1965        0.4556322386425451
+wye -0.03915349075204785  0.5255096523974457 1966      0.0015051268704373377   1         0.00000000000000004464425401127647  1966      1      0.8538317334220837 0.1267454301662969  1966        0.3899172181859931
+zee 0.0027812364960401333 0.5043070448033061 2047      0.000007751652858787357 1         0.00000000000000004819404567023685  2047      1      0.8524439912011011 0.12401684308018947 2047        0.39356598090006495
+hat -0.018620577041095272 0.5179005397264937 1941      0.00035200366460556604  1         -0.00000000000000003400445761787692 1941      1      0.8412305086345017 0.13557328318623207 1941        0.3687944261732266
 
Here's an example simple line-fit. The `x` and `y` @@ -3382,11 +3384,11 @@ upsec_count_pca_quality 0.9999590846136102 donesec 92.33051350964094 color purple -upsec_count_pca_m -39.03009744795354 -upsec_count_pca_b 979.9883413064914 +upsec_count_pca_m -39.030097447953594 +upsec_count_pca_b 979.9883413064917 upsec_count_pca_n 21 upsec_count_pca_quality 0.9999908956206317 -donesec 25.10852919630297 +donesec 25.108529196302943 ## step @@ -3614,9 +3616,9 @@ distinct_count 5 5 10000 10000 10000 mode pan wye 1 0.3467901443380824 0.7268028627434533 sum 0 0 50005000 4986.019681679581 5062.057444929905 mean - - 5000.5 0.49860196816795804 0.5062057444929905 -stddev - - 2886.8956799071675 0.2902925151144007 0.290880086426933 -var - - 8334166.666666667 0.08426974433144456 0.08461122467974003 -skewness - - 0 -0.0006899591185521965 -0.017849760120133784 +stddev - - 2886.8956799071675 0.29029251511440074 0.2908800864269331 +var - - 8334166.666666667 0.08426974433144457 0.08461122467974005 +skewness - - 0 -0.0006899591185517494 -0.01784976012013298 minlen 3 3 1 15 13 maxlen 3 3 5 22 22 min eks eks 1 0.00004509679127584487 0.00008818962627266114 diff --git a/docs/src/two-pass-algorithms.md b/docs/src/two-pass-algorithms.md index 146f3a81e..e475aebf3 100644 --- a/docs/src/two-pass-algorithms.md +++ b/docs/src/two-pass-algorithms.md @@ -598,8 +598,8 @@ hat pan 0.4643355557376876 x_count 10000 x_sum 4986.019681679581 x_mean 0.49860196816795804 -x_var 0.08426974433144456 -x_stddev 0.2902925151144007 +x_var 0.08426974433144457 +x_stddev 0.29029251511440074
diff --git a/internal/pkg/mlrval/mlrmap_json.go b/internal/pkg/mlrval/mlrmap_json.go
index 02238bae5..7b2628ed7 100644
--- a/internal/pkg/mlrval/mlrmap_json.go
+++ b/internal/pkg/mlrval/mlrmap_json.go
@@ -169,6 +169,14 @@ func (entry *MlrmapEntry) JSONParseInPlace() {
 	}
 }
 
+// JSONTryParseInPlace replaces the entry's value with its JSON-parsed form;
+// if TryUnmarshalJSON reports an error, the existing value is left untouched.
+func (entry *MlrmapEntry) JSONTryParseInPlace() {
+	if parsed, err := TryUnmarshalJSON([]byte(entry.Value.String())); err == nil {
+		entry.Value = parsed
+	}
+}
+
 // StringifyValuesRecursively is nominally for the `--jvquoteall` flag.
 func (mlrmap *Mlrmap) StringifyValuesRecursively() {
 	for pe := mlrmap.Head; pe != nil; pe = pe.Next {
diff --git a/internal/pkg/mlrval/mlrval_json.go b/internal/pkg/mlrval/mlrval_json.go
index 087f52338..0b60d0ee5 100644
--- a/internal/pkg/mlrval/mlrval_json.go
+++ b/internal/pkg/mlrval/mlrval_json.go
@@ -114,6 +114,16 @@ func (mv *Mlrval) UnmarshalJSON(inputBytes []byte) error {
 	return nil
 }
 
+// ----------------------------------------------------------------
+// TryUnmarshalJSON decodes a JSON value from inputBytes; a premature EOF is reported as an error.
+func TryUnmarshalJSON(inputBytes []byte) (pmv *Mlrval, err error) {
+	pmv, atEOF, err := MlrvalDecodeFromJSON(json.NewDecoder(bytes.NewReader(inputBytes)))
+	if atEOF {
+		err = fmt.Errorf("mlr: JSON parser: unexpected premature EOF.")
+	}
+	return pmv, err
+}
+
 // ----------------------------------------------------------------
 func MlrvalDecodeFromJSON(decoder *json.Decoder) (
 	mlrval *Mlrval,
diff --git a/internal/pkg/transformers/json_parse.go b/internal/pkg/transformers/json_parse.go
index 4759eb4e8..64904a653 100644
--- a/internal/pkg/transformers/json_parse.go
+++ b/internal/pkg/transformers/json_parse.go
@@ -31,6 +31,8 @@ func transformerJSONParseUsage(
 	)
 	fmt.Fprintf(o, "Options:\n")
 	fmt.Fprintf(o, "-f {...} Comma-separated list of field names to json-parse (default all).\n")
+	fmt.Fprintf(o, "-k       If supplied, then on parse fail for any cell, keep the (unparsable)\n")
+	fmt.Fprintf(o, "         input value for the cell.\n")
 	fmt.Fprintf(o, "-h|--help Show this message.\n")
 }
 
@@ -46,6 +48,7 @@ func transformerJSONParseParseCLI(
 	argi := *pargi
 	verb := args[argi]
 	argi++
+	keepFailed := false
 
 	var fieldNames []string = nil
 
@@ -66,6 +69,9 @@ func transformerJSONParseParseCLI(
 		} else if opt == "-f" {
 			fieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
 
+		} else if opt == "-k" {
+			keepFailed = true
+
 		} else {
 			transformerJSONParseUsage(os.Stderr)
 			os.Exit(1)
@@ -79,6 +85,7 @@ func transformerJSONParseParseCLI(
 
 	transformer, err := NewTransformerJSONParse(
 		fieldNames,
+		keepFailed,
 	)
 	if err != nil {
 		fmt.Fprintln(os.Stderr, err)
@@ -92,6 +99,7 @@ func transformerJSONParseParseCLI(
 type TransformerJSONParse struct {
 	// input
 	fieldNameSet map[string]bool
+	keepFailed   bool
 
 	// state
 	recordTransformerFunc RecordTransformerFunc
@@ -99,6 +107,7 @@ type TransformerJSONParse struct {
 
 func NewTransformerJSONParse(
 	fieldNames []string,
+	keepFailed bool,
 ) (*TransformerJSONParse, error) {
 	var fieldNameSet map[string]bool = nil
 	if fieldNames != nil {
@@ -107,6 +116,7 @@ func NewTransformerJSONParse(
 
 	retval := &TransformerJSONParse{
 		fieldNameSet: fieldNameSet,
+		keepFailed:   keepFailed,
 	}
 
 	retval.recordTransformerFunc = retval.jsonParseAll
@@ -139,7 +149,11 @@ func (tr *TransformerJSONParse) jsonParseAll(
 	if !inrecAndContext.EndOfStream {
 		inrec := inrecAndContext.Record
 		for pe := inrec.Head; pe != nil; pe = pe.Next {
-			pe.JSONParseInPlace()
+			if tr.keepFailed {
+				pe.JSONTryParseInPlace()
+			} else {
+				pe.JSONParseInPlace()
+			}
 		}
 		outputRecordsAndContexts.PushBack(inrecAndContext)
 	} else {
@@ -158,7 +172,11 @@ func (tr *TransformerJSONParse) jsonParseSome(
 		inrec := inrecAndContext.Record
 		for pe := inrec.Head; pe != nil; pe = pe.Next {
 			if tr.fieldNameSet[pe.Key] {
-				pe.JSONParseInPlace()
+				if tr.keepFailed {
+					pe.JSONTryParseInPlace()
+				} else {
+					pe.JSONParseInPlace()
+				}
 			}
 		}
 		outputRecordsAndContexts.PushBack(inrecAndContext)
diff --git a/man/manpage.txt b/man/manpage.txt
index 77149c34b..37d960e3d 100644
--- a/man/manpage.txt
+++ b/man/manpage.txt
@@ -1248,6 +1248,8 @@ MILLER(1)                                                            MILLER(1)
        Tries to convert string field values to parsed JSON, e.g. "[1,2,3]" -> [1,2,3].
        Options:
        -f {...} Comma-separated list of field names to json-parse (default all).
+       -k       If supplied, then on parse fail for any cell, keep the (unparsable)
+                input value for the cell.
        -h|--help Show this message.
 
    1mjson-stringify0m
@@ -3336,4 +3338,4 @@ MILLER(1)                                                            MILLER(1)
 
 
 
-                                  2023-04-20                         MILLER(1)
+                                  2023-05-13                         MILLER(1)
diff --git a/man/mlr.1 b/man/mlr.1
index b3545e520..00a67f9ec 100644
--- a/man/mlr.1
+++ b/man/mlr.1
@@ -2,12 +2,12 @@
 .\"     Title: mlr
 .\"    Author: [see the "AUTHOR" section]
 .\" Generator: ./mkman.rb
-.\"      Date: 2023-04-20
+.\"      Date: 2023-05-13
 .\"    Manual: \ \&
 .\"    Source: \ \&
 .\"  Language: English
 .\"
-.TH "MILLER" "1" "2023-04-20" "\ \&" "\ \&"
+.TH "MILLER" "1" "2023-05-13" "\ \&" "\ \&"
 .\" -----------------------------------------------------------------
 .\" * Portability definitions
 .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1575,6 +1575,8 @@ Usage: mlr json-parse [options]
 Tries to convert string field values to parsed JSON, e.g. "[1,2,3]" -> [1,2,3].
 Options:
 -f {...} Comma-separated list of field names to json-parse (default all).
+-k       If supplied, then on parse fail for any cell, keep the (unparsable)
+         input value for the cell.
 -h|--help Show this message.
 .fi
 .if n \{\
diff --git a/test/cases/cli-help/0001/expout b/test/cases/cli-help/0001/expout
index 53dd46894..374275e77 100644
--- a/test/cases/cli-help/0001/expout
+++ b/test/cases/cli-help/0001/expout
@@ -415,6 +415,8 @@ Usage: mlr json-parse [options]
 Tries to convert string field values to parsed JSON, e.g. "[1,2,3]" -> [1,2,3].
 Options:
 -f {...} Comma-separated list of field names to json-parse (default all).
+-k       If supplied, then on parse fail for any cell, keep the (unparsable)
+         input value for the cell.
 -h|--help Show this message.
 
 ================================================================
diff --git a/test/cases/verb-json-parse-json-stringify/0027-k/cmd b/test/cases/verb-json-parse-json-stringify/0027-k/cmd
new file mode 100644
index 000000000..143ccab1a
--- /dev/null
+++ b/test/cases/verb-json-parse-json-stringify/0027-k/cmd
@@ -0,0 +1 @@
+mlr --c2j json-parse test/input/json-parse-with-error.csv
diff --git a/test/cases/verb-json-parse-json-stringify/0027-k/experr b/test/cases/verb-json-parse-json-stringify/0027-k/experr
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/cases/verb-json-parse-json-stringify/0027-k/expout b/test/cases/verb-json-parse-json-stringify/0027-k/expout
new file mode 100644
index 000000000..9316778f3
--- /dev/null
+++ b/test/cases/verb-json-parse-json-stringify/0027-k/expout
@@ -0,0 +1,19 @@
+[
+{
+  "foo": "ts",
+  "bar": "some value",
+  "baz": 10
+},
+{
+  "foo": "ts",
+  "bar": (error),
+  "baz": 10
+},
+{
+  "foo": "ts",
+  "bar": {
+    "key": "val"
+  },
+  "baz": 10
+}
+]
diff --git a/test/cases/verb-json-parse-json-stringify/0028-k/cmd b/test/cases/verb-json-parse-json-stringify/0028-k/cmd
new file mode 100644
index 000000000..2e27bc2ee
--- /dev/null
+++ b/test/cases/verb-json-parse-json-stringify/0028-k/cmd
@@ -0,0 +1 @@
+mlr --c2j json-parse -k test/input/json-parse-with-error.csv
diff --git a/test/cases/verb-json-parse-json-stringify/0028-k/experr b/test/cases/verb-json-parse-json-stringify/0028-k/experr
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/cases/verb-json-parse-json-stringify/0028-k/expout b/test/cases/verb-json-parse-json-stringify/0028-k/expout
new file mode 100644
index 000000000..25ff4be98
--- /dev/null
+++ b/test/cases/verb-json-parse-json-stringify/0028-k/expout
@@ -0,0 +1,19 @@
+[
+{
+  "foo": "ts",
+  "bar": "some value",
+  "baz": 10
+},
+{
+  "foo": "ts",
+  "bar": "{key:\"val\"}",
+  "baz": 10
+},
+{
+  "foo": "ts",
+  "bar": {
+    "key": "val"
+  },
+  "baz": 10
+}
+]
diff --git a/test/input/json-parse-with-error.csv b/test/input/json-parse-with-error.csv
new file mode 100644
index 000000000..76f9bf71f
--- /dev/null
+++ b/test/input/json-parse-with-error.csv
@@ -0,0 +1,4 @@
+foo,bar,baz
+"""ts""","""some value""",10
+"""ts""","{key:""val""}",10
+"""ts""","{""key"":""val""}",10