diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 000000000..32e5a1e18 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,138 @@ +version: 2 + +_defaults: &defaults + working_directory: ~/repo + environment: + TERM: dumb + docker: + - image: circleci/openjdk:8-jdk + +_setenv: &setenv + name: set CloudRepo credentials + command: |- + [ -d $HOME/.sbt ] || mkdir $HOME/.sbt + printf "realm=s22s.mycloudrepo.io\nhost=s22s.mycloudrepo.io\nuser=$CLOUDREPO_USER\npassword=$CLOUDREPO_PASSWORD\n" > $HOME/.sbt/.credentials + +_delenv: &unsetenv + name: delete CloudRepo credentials + command: rm -rf $HOME/.sbt/.credentials || true + +_restore_cache: &restore_cache + keys: + - v2-dependencies-{{ checksum "build.sbt" }} + - v2-dependencies- + +_save_cache: &save_cache + key: v2-dependencies-{{ checksum "build.sbt" }} + paths: + - ~/.ivy2/cache + - ~/.sbt + - ~/.rf_cache + +jobs: + staticAnalysis: + <<: *defaults + + steps: + - checkout + - run: *setenv + - restore_cache: + <<: *restore_cache + + - run: cat /dev/null | sbt dependencyCheck + - run: cat /dev/null | sbt --debug dumpLicenseReport + + - run: *unsetenv + + - save_cache: + <<: *save_cache + - store_artifacts: + path: datasource/target/scala-2.11/dependency-check-report.html + destination: dependency-check-report-datasource.html + - store_artifacts: + path: experimental/target/scala-2.11/dependency-check-report.html + destination: dependency-check-report-experimental.html + - store_artifacts: + path: core/target/scala-2.11/dependency-check-report.html + destination: dependency-check-report-core.html + - store_artifacts: + path: pyrasterframes/target/scala-2.11/dependency-check-report.html + destination: dependency-check-report-pyrasterframes.html + + test: + <<: *defaults + resource_class: large + steps: + - checkout + - run: *setenv + - restore_cache: + <<: *restore_cache + + - run: sudo apt-get install python-pip pandoc && pip install setuptools # required for pyrasterframes testing + - run: cat /dev/null | sbt test + + - run: *unsetenv + - save_cache: + <<: *save_cache + + publish: + <<: *defaults + resource_class: large + steps: + - checkout + - run: *setenv + - restore_cache: + <<: *restore_cache + + - run: sudo apt-get install python-pip pandoc && pip install setuptools # required for pyrasterframes testing + - run: cat /dev/null | sbt test + - run: cat /dev/null | sbt publish + + - run: *unsetenv + - save_cache: + <<: *save_cache + + it: + <<: *defaults + resource_class: xlarge + steps: + - checkout + - run: *setenv + + - restore_cache: + <<: *restore_cache + + - run: + command: cat /dev/null | sbt it:test + no_output_timeout: 30m + - run: *unsetenv + + - save_cache: + <<: *save_cache + +workflows: + version: 2 + all: + jobs: + - test + - it: + filters: + branches: + only: + - /astraea\/feature\/.*-its/ + - publish: + filters: + branches: + only: + - astraea/develop + nightlyReleaseAstraea: + triggers: + - schedule: + cron: "0 8 * * *" + filters: + branches: + only: + - astraea/develop + jobs: + - it + - staticAnalysis diff --git a/.scalafmt.conf b/.scalafmt.conf index 61f56e01e..4d09e93c7 100644 --- a/.scalafmt.conf +++ b/.scalafmt.conf @@ -1,5 +1,7 @@ -maxColumn = 100 +maxColumn = 138 continuationIndent.defnSite = 2 +continuationIndent.callSite = 2 +continuationIndent.extendSite = 2 binPack.parentConstructors = true binPack.literalArgumentLists = false binPack.unsafeCallSite = true diff --git a/.travis.yml b/.travis.yml index b83a56286..fbe2823fa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,7 +5,8 @@ 
language: scala cache: directories: - $HOME/.ivy2/cache - - $HOME/.sbt/boot/ + - $HOME/.sbt/boot + - $HOME/.rf_cache scala: - 2.11.11 @@ -14,17 +15,23 @@ jdk: - oraclejdk8 addons: - apt_packages: - - pandoc + apt: + packages: + - pandoc + - python-pip + +install: + - pip install setuptools sbt_args: -no-colors script: - - sbt -Dfile.encoding=UTF8 clean core/test datasource/test + - sbt test + - sbt it:test # - sbt -Dfile.encoding=UTF8 clean coverage test coverageReport # Tricks to avoid unnecessary cache updates - find $HOME/.sbt -name "*.lock" | xargs rm - find $HOME/.ivy2 -name "ivydata-*.properties" | xargs rm #after_success: -# - bash <(curl -s https://codecov.io/bash) +# - bash <(curl -s https://codecov.io/bash) \ No newline at end of file diff --git a/README.md b/README.md index 8644f40a5..dddeb94ae 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,5 @@ -[![Build Status](https://travis-ci.org/locationtech/rasterframes.svg?branch=master)](https://travis-ci.org/s22s/raster-frames) [![Join the chat at https://gitter.im/s22s/raster-frames](https://badges.gitter.im/s22s/raster-frames.svg)](https://gitter.im/s22s/raster-frames?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) _RasterFrames™_ brings the power of Spark DataFrames to geospatial raster data, empowered by the map algebra and tile layer operations of [GeoTrellis](https://geotrellis.io/). diff --git a/bench/archive/jmh-results-20190206135840.json b/bench/archive/jmh-results-20190206135840.json new file mode 100644 index 000000000..958dad131 --- /dev/null +++ b/bench/archive/jmh-results-20190206135840.json @@ -0,0 +1,1124 @@ +[ + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.encode", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "uint8", + "tileSize" : "64" + }, + "primaryMetric" : { + "score" : 0.750756701914575, + "scoreError" : 0.03489921840300597, + "scoreConfidence" : [ + 0.715857483511569, + 0.785655920317581 + ], + "scorePercentiles" : { + "0.0" : 0.7386746014362018, + "50.0" : 0.752125418246387, + "90.0" : 0.7593754045781637, + "95.0" : 0.7593754045781637, + "99.0" : 0.7593754045781637, + "99.9" : 0.7593754045781637, + "99.99" : 0.7593754045781637, + "99.999" : 0.7593754045781637, + "99.9999" : 0.7593754045781637, + "100.0" : 0.7593754045781637 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 0.7386746014362018, + 0.7590235001754141, + 0.7593754045781637, + 0.7445845851367084, + 0.752125418246387 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.encode", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" 
: "uint8", + "tileSize" : "512" + }, + "primaryMetric" : { + "score" : 38.38058292433662, + "scoreError" : 0.5766293256970553, + "scoreConfidence" : [ + 37.803953598639566, + 38.95721225003368 + ], + "scorePercentiles" : { + "0.0" : 38.17950849889709, + "50.0" : 38.483487036965805, + "90.0" : 38.49375297517132, + "95.0" : 38.49375297517132, + "99.0" : 38.49375297517132, + "99.9" : 38.49375297517132, + "99.99" : 38.49375297517132, + "99.999" : 38.49375297517132, + "99.9999" : 38.49375297517132, + "100.0" : 38.49375297517132 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 38.17950849889709, + 38.25959587406489, + 38.483487036965805, + 38.49375297517132, + 38.48657023658396 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.encode", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "int32", + "tileSize" : "64" + }, + "primaryMetric" : { + "score" : 6.859730529387528, + "scoreError" : 0.15015114495123957, + "scoreConfidence" : [ + 6.709579384436289, + 7.009881674338768 + ], + "scorePercentiles" : { + "0.0" : 6.807625212637563, + "50.0" : 6.851807420817827, + "90.0" : 6.912795585073158, + "95.0" : 6.912795585073158, + "99.0" : 6.912795585073158, + "99.9" : 6.912795585073158, + "99.99" : 6.912795585073158, + "99.999" : 6.912795585073158, + "99.9999" : 6.912795585073158, + "100.0" : 6.912795585073158 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 6.851807420817827, + 6.847977811681636, + 6.912795585073158, + 6.807625212637563, + 6.878446616727458 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.encode", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "int32", + "tileSize" : "512" + }, + "primaryMetric" : { + "score" : 480.9422408208602, + "scoreError" : 5.771651213055483, + "scoreConfidence" : [ + 475.17058960780474, + 486.71389203391567 + ], + "scorePercentiles" : { + "0.0" : 478.50027097977335, + "50.0" : 481.1490574795575, + "90.0" : 482.35979851501855, + "95.0" : 482.35979851501855, + "99.0" : 482.35979851501855, + "99.9" : 482.35979851501855, + "99.99" : 482.35979851501855, + "99.999" : 482.35979851501855, + "99.9999" : 482.35979851501855, + "100.0" : 482.35979851501855 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 478.50027097977335, + 481.91533816448384, + 481.1490574795575, + 482.35979851501855, + 480.7867389654676 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.encode", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : 
"/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "float32", + "tileSize" : "64" + }, + "primaryMetric" : { + "score" : 6.818078360307711, + "scoreError" : 0.08318892187153049, + "scoreConfidence" : [ + 6.734889438436181, + 6.901267282179241 + ], + "scorePercentiles" : { + "0.0" : 6.7948532935574955, + "50.0" : 6.811025113390723, + "90.0" : 6.846424033994792, + "95.0" : 6.846424033994792, + "99.0" : 6.846424033994792, + "99.9" : 6.846424033994792, + "99.99" : 6.846424033994792, + "99.999" : 6.846424033994792, + "99.9999" : 6.846424033994792, + "100.0" : 6.846424033994792 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 6.803675604268625, + 6.811025113390723, + 6.7948532935574955, + 6.834413756326912, + 6.846424033994792 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.encode", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "float32", + "tileSize" : "512" + }, + "primaryMetric" : { + "score" : 493.1200339402855, + "scoreError" : 5.741953611582008, + "scoreConfidence" : [ + 487.3780803287035, + 498.8619875518675 + ], + "scorePercentiles" : { + "0.0" : 490.7129639878324, + "50.0" : 493.6863954119388, + "90.0" : 494.3884258252619, + "95.0" : 494.3884258252619, + "99.0" : 494.3884258252619, + "99.9" : 494.3884258252619, + "99.99" : 494.3884258252619, + "99.999" : 494.3884258252619, + "99.9999" : 494.3884258252619, + "100.0" : 494.3884258252619 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 493.6863954119388, + 492.6966235971648, + 490.7129639878324, + 494.3884258252619, + 494.1157608792294 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.encode", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "float64", + "tileSize" : "64" + }, + "primaryMetric" : { + "score" : 14.642728171265563, + "scoreError" : 0.3786592031928636, + "scoreConfidence" : [ + 14.2640689680727, + 15.021387374458428 + ], + "scorePercentiles" : { + "0.0" : 14.547127458647054, + "50.0" : 14.617635183386758, + "90.0" : 14.792616763059605, + "95.0" : 14.792616763059605, + "99.0" : 14.792616763059605, + "99.9" : 14.792616763059605, + "99.99" : 14.792616763059605, + "99.999" : 14.792616763059605, + "99.9999" : 
14.792616763059605, + "100.0" : 14.792616763059605 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 14.547127458647054, + 14.617635183386758, + 14.573274428428881, + 14.792616763059605, + 14.682987022805511 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.encode", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "float64", + "tileSize" : "512" + }, + "primaryMetric" : { + "score" : 1080.8359386830703, + "scoreError" : 35.03166838916621, + "scoreConfidence" : [ + 1045.804270293904, + 1115.8676070722365 + ], + "scorePercentiles" : { + "0.0" : 1071.5023248018847, + "50.0" : 1078.4063772364734, + "90.0" : 1092.8710304751503, + "95.0" : 1092.8710304751503, + "99.0" : 1092.8710304751503, + "99.9" : 1092.8710304751503, + "99.99" : 1092.8710304751503, + "99.999" : 1092.8710304751503, + "99.9999" : 1092.8710304751503, + "100.0" : 1092.8710304751503 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 1071.5023248018847, + 1073.8747796864934, + 1092.8710304751503, + 1087.5251812153494, + 1078.4063772364734 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.encode", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "rasterRef", + "tileSize" : "64" + }, + "primaryMetric" : { + "score" : 12.188313148621546, + "scoreError" : 7.365297251370428, + "scoreConfidence" : [ + 4.823015897251119, + 19.553610399991975 + ], + "scorePercentiles" : { + "0.0" : 11.186523380550728, + "50.0" : 11.374585349686248, + "90.0" : 15.606331373536978, + "95.0" : 15.606331373536978, + "99.0" : 15.606331373536978, + "99.9" : 15.606331373536978, + "99.99" : 15.606331373536978, + "99.999" : 15.606331373536978, + "99.9999" : 15.606331373536978, + "100.0" : 15.606331373536978 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 11.186523380550728, + 11.357274436209732, + 11.374585349686248, + 11.416851203124038, + 15.606331373536978 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.encode", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "rasterRef", + "tileSize" : "512" + }, + 
"primaryMetric" : { + "score" : 11.16416039038066, + "scoreError" : 0.6904470565404526, + "scoreConfidence" : [ + 10.473713333840207, + 11.854607446921113 + ], + "scorePercentiles" : { + "0.0" : 10.958926899271512, + "50.0" : 11.144335208904712, + "90.0" : 11.379572838567773, + "95.0" : 11.379572838567773, + "99.0" : 11.379572838567773, + "99.9" : 11.379572838567773, + "99.99" : 11.379572838567773, + "99.999" : 11.379572838567773, + "99.9999" : 11.379572838567773, + "100.0" : 11.379572838567773 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 11.30976879591712, + 11.379572838567773, + 11.144335208904712, + 11.02819820924218, + 10.958926899271512 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.roundTrip", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "uint8", + "tileSize" : "64" + }, + "primaryMetric" : { + "score" : 4.762673330888573, + "scoreError" : 0.3629765650696548, + "scoreConfidence" : [ + 4.399696765818918, + 5.125649895958228 + ], + "scorePercentiles" : { + "0.0" : 4.654367944695545, + "50.0" : 4.771804106710553, + "90.0" : 4.888312020456609, + "95.0" : 4.888312020456609, + "99.0" : 4.888312020456609, + "99.9" : 4.888312020456609, + "99.99" : 4.888312020456609, + "99.999" : 4.888312020456609, + "99.9999" : 4.888312020456609, + "100.0" : 4.888312020456609 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 4.888312020456609, + 4.771804106710553, + 4.654367944695545, + 4.811061175897903, + 4.687821406682261 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.roundTrip", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "uint8", + "tileSize" : "512" + }, + "primaryMetric" : { + "score" : 125.38841234041345, + "scoreError" : 12.251197281994925, + "scoreConfidence" : [ + 113.13721505841853, + 137.6396096224084 + ], + "scorePercentiles" : { + "0.0" : 121.32589970894979, + "50.0" : 126.16043763634758, + "90.0" : 129.26154882555105, + "95.0" : 129.26154882555105, + "99.0" : 129.26154882555105, + "99.9" : 129.26154882555105, + "99.99" : 129.26154882555105, + "99.999" : 129.26154882555105, + "99.9999" : 129.26154882555105, + "100.0" : 129.26154882555105 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 126.16043763634758, + 127.1243125222386, + 129.26154882555105, + 123.06986300898019, + 121.32589970894979 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.roundTrip", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : 
"/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "int32", + "tileSize" : "64" + }, + "primaryMetric" : { + "score" : 18.333069815973175, + "scoreError" : 0.3341581136847717, + "scoreConfidence" : [ + 17.998911702288403, + 18.667227929657948 + ], + "scorePercentiles" : { + "0.0" : 18.226296598017225, + "50.0" : 18.31079228696827, + "90.0" : 18.426784947207096, + "95.0" : 18.426784947207096, + "99.0" : 18.426784947207096, + "99.9" : 18.426784947207096, + "99.99" : 18.426784947207096, + "99.999" : 18.426784947207096, + "99.9999" : 18.426784947207096, + "100.0" : 18.426784947207096 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 18.31079228696827, + 18.284332434448437, + 18.426784947207096, + 18.226296598017225, + 18.417142813224846 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.roundTrip", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "int32", + "tileSize" : "512" + }, + "primaryMetric" : { + "score" : 1211.7797319906595, + "scoreError" : 49.26483544063887, + "scoreConfidence" : [ + 1162.5148965500207, + 1261.0445674312982 + ], + "scorePercentiles" : { + "0.0" : 1195.925568730576, + "50.0" : 1210.8324462729913, + "90.0" : 1226.1002647058824, + "95.0" : 1226.1002647058824, + "99.0" : 1226.1002647058824, + "99.9" : 1226.1002647058824, + "99.99" : 1226.1002647058824, + "99.999" : 1226.1002647058824, + "99.9999" : 1226.1002647058824, + "100.0" : 1226.1002647058824 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 1210.8324462729913, + 1195.925568730576, + 1222.877439173493, + 1226.1002647058824, + 1203.1629410703547 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.roundTrip", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "float32", + "tileSize" : "64" + }, + "primaryMetric" : { + "score" : 20.739003038529034, + "scoreError" : 9.965728861183356, + "scoreConfidence" : [ + 10.773274177345678, + 30.704731899712392 + ], + "scorePercentiles" : { + "0.0" : 18.961133792379307, + "50.0" : 19.19435088399221, + "90.0" : 24.96128293474057, + "95.0" : 24.96128293474057, + "99.0" : 24.96128293474057, + "99.9" : 24.96128293474057, + "99.99" : 24.96128293474057, + "99.999" : 24.96128293474057, 
+ "99.9999" : 24.96128293474057, + "100.0" : 24.96128293474057 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 19.061690043075515, + 19.19435088399221, + 18.961133792379307, + 21.516557538457565, + 24.96128293474057 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.roundTrip", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "float32", + "tileSize" : "512" + }, + "primaryMetric" : { + "score" : 1224.738164769416, + "scoreError" : 126.4638940057822, + "scoreConfidence" : [ + 1098.274270763634, + 1351.202058775198 + ], + "scorePercentiles" : { + "0.0" : 1181.9325521805933, + "50.0" : 1229.8881532030002, + "90.0" : 1268.8545514967022, + "95.0" : 1268.8545514967022, + "99.0" : 1268.8545514967022, + "99.9" : 1268.8545514967022, + "99.99" : 1268.8545514967022, + "99.999" : 1268.8545514967022, + "99.9999" : 1268.8545514967022, + "100.0" : 1268.8545514967022 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 1268.8545514967022, + 1237.1361243971808, + 1205.8794425696035, + 1229.8881532030002, + 1181.9325521805933 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.roundTrip", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "float64", + "tileSize" : "64" + }, + "primaryMetric" : { + "score" : 41.35522075338555, + "scoreError" : 1.5888146686882507, + "scoreConfidence" : [ + 39.7664060846973, + 42.9440354220738 + ], + "scorePercentiles" : { + "0.0" : 40.89423389193166, + "50.0" : 41.27828040884876, + "90.0" : 42.01334043026831, + "95.0" : 42.01334043026831, + "99.0" : 42.01334043026831, + "99.9" : 42.01334043026831, + "99.99" : 42.01334043026831, + "99.999" : 42.01334043026831, + "99.9999" : 42.01334043026831, + "100.0" : 42.01334043026831 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 40.89423389193166, + 41.189732083674706, + 42.01334043026831, + 41.27828040884876, + 41.40051695220435 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.roundTrip", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "float64", + "tileSize" : "512" + }, + 
"primaryMetric" : { + "score" : 3401.0174112275795, + "scoreError" : 497.2925126951152, + "scoreConfidence" : [ + 2903.724898532464, + 3898.309923922695 + ], + "scorePercentiles" : { + "0.0" : 3234.1288173884936, + "50.0" : 3398.0858393887947, + "90.0" : 3570.2330531573316, + "95.0" : 3570.2330531573316, + "99.0" : 3570.2330531573316, + "99.9" : 3570.2330531573316, + "99.99" : 3570.2330531573316, + "99.999" : 3570.2330531573316, + "99.9999" : 3570.2330531573316, + "100.0" : 3570.2330531573316 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 3472.802212426241, + 3570.2330531573316, + 3398.0858393887947, + 3234.1288173884936, + 3329.837133777038 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.roundTrip", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "rasterRef", + "tileSize" : "64" + }, + "primaryMetric" : { + "score" : 27.695579291790057, + "scoreError" : 1.727793688261407, + "scoreConfidence" : [ + 25.96778560352865, + 29.423372980051465 + ], + "scorePercentiles" : { + "0.0" : 27.169023039660257, + "50.0" : 27.78487872459196, + "90.0" : 28.259312501235915, + "95.0" : 28.259312501235915, + "99.0" : 28.259312501235915, + "99.9" : 28.259312501235915, + "99.99" : 28.259312501235915, + "99.999" : 28.259312501235915, + "99.9999" : 28.259312501235915, + "100.0" : 28.259312501235915 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 27.78487872459196, + 28.259312501235915, + 27.320646766780545, + 27.169023039660257, + 27.9440354266816 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.TileEncodeBench.roundTrip", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "rasterRef", + "tileSize" : "512" + }, + "primaryMetric" : { + "score" : 28.004859122985067, + "scoreError" : 0.7553941535592298, + "scoreConfidence" : [ + 27.249464969425837, + 28.760253276544297 + ], + "scorePercentiles" : { + "0.0" : 27.752906370782245, + "50.0" : 27.946522581888676, + "90.0" : 28.21079035273483, + "95.0" : 28.21079035273483, + "99.0" : 28.21079035273483, + "99.9" : 28.21079035273483, + "99.99" : 28.21079035273483, + "99.999" : 28.21079035273483, + "99.9999" : 28.21079035273483, + "100.0" : 28.21079035273483 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 28.21079035273483, + 27.917038521558283, + 28.19703778796129, + 27.946522581888676, + 27.752906370782245 + ] + ] + }, + "secondaryMetrics" : { + } + } +] + + diff --git a/bench/archive/jmh-results-20190207140524.json b/bench/archive/jmh-results-20190207140524.json new file mode 100644 index 000000000..a9b0ea424 --- /dev/null +++ 
b/bench/archive/jmh-results-20190207140524.json @@ -0,0 +1,118 @@ +[ + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.BinaryTileOpBench.viaExpression", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "uint16ud255", + "numTiles" : "100", + "tileSize" : "64" + }, + "primaryMetric" : { + "score" : 59.48713747203768, + "scoreError" : 1.682438644557827, + "scoreConfidence" : [ + 57.804698827479854, + 61.16957611659551 + ], + "scorePercentiles" : { + "0.0" : 59.08482080588235, + "50.0" : 59.30686591715976, + "90.0" : 60.0763132994012, + "95.0" : 60.0763132994012, + "99.0" : 60.0763132994012, + "99.9" : 60.0763132994012, + "99.99" : 60.0763132994012, + "99.999" : 60.0763132994012, + "99.9999" : 60.0763132994012, + "100.0" : 60.0763132994012 + }, + "scoreUnit" : "ms/op", + "rawData" : [ + [ + 59.30686591715976, + 59.08482080588235, + 59.81708820833333, + 60.0763132994012, + 59.150599129411766 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.BinaryTileOpBench.viaUdf", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "uint16ud255", + "numTiles" : "100", + "tileSize" : "64" + }, + "primaryMetric" : { + "score" : 56.46931398875338, + "scoreError" : 2.4604658224787643, + "scoreConfidence" : [ + 54.00884816627461, + 58.929779811232144 + ], + "scorePercentiles" : { + "0.0" : 55.630841994444445, + "50.0" : 56.575462519774014, + "90.0" : 57.347593422857145, + "95.0" : 57.347593422857145, + "99.0" : 57.347593422857145, + "99.9" : 57.347593422857145, + "99.99" : 57.347593422857145, + "99.999" : 57.347593422857145, + "99.9999" : 57.347593422857145, + "100.0" : 57.347593422857145 + }, + "scoreUnit" : "ms/op", + "rawData" : [ + [ + 57.347593422857145, + 55.630841994444445, + 56.13477540782123, + 56.65789659887005, + 56.575462519774014 + ] + ] + }, + "secondaryMetrics" : { + } + } +] + + diff --git a/bench/archive/jmh-results-20190224115207.json b/bench/archive/jmh-results-20190224115207.json new file mode 100644 index 000000000..7aed87d40 --- /dev/null +++ b/bench/archive/jmh-results-20190224115207.json @@ -0,0 +1,118 @@ +[ + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.BinaryTileOpBench.viaExpression", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + 
"measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "uint16ud255", + "numTiles" : "100", + "tileSize" : "64" + }, + "primaryMetric" : { + "score" : 60.24139353949952, + "scoreError" : 3.2516181730645295, + "scoreConfidence" : [ + 56.98977536643499, + 63.49301171256405 + ], + "scorePercentiles" : { + "0.0" : 59.459361017751476, + "50.0" : 60.078545143712574, + "90.0" : 61.480693024539875, + "95.0" : 61.480693024539875, + "99.0" : 61.480693024539875, + "99.9" : 61.480693024539875, + "99.99" : 61.480693024539875, + "99.999" : 61.480693024539875, + "99.9999" : 61.480693024539875, + "100.0" : 61.480693024539875 + }, + "scoreUnit" : "ms/op", + "rawData" : [ + [ + 59.459361017751476, + 60.65764786060606, + 60.078545143712574, + 59.530720650887574, + 61.480693024539875 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "astraea.spark.rasterframes.bench.BinaryTileOpBench.viaUdf", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "params" : { + "cellTypeName" : "uint16ud255", + "numTiles" : "100", + "tileSize" : "64" + }, + "primaryMetric" : { + "score" : 58.69963037452669, + "scoreError" : 2.0485509148735965, + "scoreConfidence" : [ + 56.6510794596531, + 60.748181289400286 + ], + "scorePercentiles" : { + "0.0" : 58.22195943604651, + "50.0" : 58.52202024561404, + "90.0" : 59.53467150595238, + "95.0" : 59.53467150595238, + "99.0" : 59.53467150595238, + "99.9" : 59.53467150595238, + "99.99" : 59.53467150595238, + "99.999" : 59.53467150595238, + "99.9999" : 59.53467150595238, + "100.0" : 59.53467150595238 + }, + "scoreUnit" : "ms/op", + "rawData" : [ + [ + 58.891654411764705, + 58.52202024561404, + 58.22195943604651, + 58.327846273255815, + 59.53467150595238 + ] + ] + }, + "secondaryMetrics" : { + } + } +] + + diff --git a/bench/src/main/scala/astraea/spark/rasterframes/bench/BinaryTileOpBench.scala b/bench/src/main/scala/astraea/spark/rasterframes/bench/BinaryTileOpBench.scala new file mode 100644 index 000000000..133d93356 --- /dev/null +++ b/bench/src/main/scala/astraea/spark/rasterframes/bench/BinaryTileOpBench.scala @@ -0,0 +1,67 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.bench +import java.util.concurrent.TimeUnit + +import astraea.spark.rasterframes.expressions.localops._ +import astraea.spark.rasterframes._ +import geotrellis.raster.Tile +import geotrellis.raster.mapalgebra.{local => gt} +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.openjdk.jmh.annotations._ +@BenchmarkMode(Array(Mode.AverageTime)) +@State(Scope.Benchmark) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +class BinaryTileOpBench extends SparkEnv { + import spark.implicits._ + + @Param(Array("uint16ud255")) + var cellTypeName: String = _ + + @Param(Array("64")) + var tileSize: Int = _ + + @Param(Array("100")) + var numTiles: Int = _ + + @transient + var tiles: DataFrame = _ + + val localAddUDF = udf((left: Tile, right: Tile) => gt.Add(left, right)) + + @Setup(Level.Trial) + def setupData(): Unit = { + tiles = Seq.fill(numTiles)((randomTile(tileSize, tileSize, cellTypeName), randomTile(tileSize, tileSize, cellTypeName))) + .toDF("left", "right").repartition(10) + } + + @Benchmark + def viaExpression(): Array[Tile] = { + tiles.select(Add($"left", $"right")).collect() + } + + @Benchmark + def viaUdf(): Array[Tile] = { + tiles.select(localAddUDF($"left", $"right").as[Tile]).collect() + } +} diff --git a/bench/src/main/scala/astraea/spark/rasterframes/bench/RasterRefBench.scala b/bench/src/main/scala/astraea/spark/rasterframes/bench/RasterRefBench.scala index c5fc316d2..c68c826e8 100644 --- a/bench/src/main/scala/astraea/spark/rasterframes/bench/RasterRefBench.scala +++ b/bench/src/main/scala/astraea/spark/rasterframes/bench/RasterRefBench.scala @@ -26,7 +26,7 @@ import java.util.concurrent.TimeUnit import astraea.spark.rasterframes import astraea.spark.rasterframes._ -import astraea.spark.rasterframes.expressions.RasterSourceToTiles +import astraea.spark.rasterframes.expressions.transformers.RasterSourceToTiles import astraea.spark.rasterframes.ref.RasterSource import astraea.spark.rasterframes.ref.RasterSource.ReadCallback import com.typesafe.scalalogging.LazyLogging diff --git a/bench/src/main/scala/astraea/spark/rasterframes/bench/StatsComputeBench.scala b/bench/src/main/scala/astraea/spark/rasterframes/bench/StatsComputeBench.scala index 721f71e63..c9aa7eef4 100644 --- a/bench/src/main/scala/astraea/spark/rasterframes/bench/StatsComputeBench.scala +++ b/bench/src/main/scala/astraea/spark/rasterframes/bench/StatsComputeBench.scala @@ -22,6 +22,7 @@ package astraea.spark.rasterframes.bench import java.util.concurrent.TimeUnit import astraea.spark.rasterframes._ +import astraea.spark.rasterframes.stats.CellHistogram import org.apache.spark.sql._ import org.openjdk.jmh.annotations._ @@ -54,25 +55,28 @@ class StatsComputeBench extends SparkEnv { .toDF("tile").repartition(10) } - @Benchmark - def computeStats() = { - tiles.select(agg_stats($"tile")).collect() - } +// @Benchmark +// def computeStats(): Array[CellStatistics] = { +// tiles.select(agg_stats($"tile")).collect() +// } @Benchmark - def extractMean() = { - tiles.select(agg_stats($"tile").getField("mean")).map(_.getDouble(0)).collect() + def computeHistogram(): Array[CellHistogram] = { + tiles.select(agg_approx_histogram($"tile")).collect() } - @Benchmark - def directMean() = { - tiles.repartition(10).select(agg_mean($"tile")).collect() - } +// @Benchmark +// def extractMean(): Array[Double] = { +// tiles.select(agg_stats($"tile").getField("mean")).map(_.getDouble(0)).collect() +// } +// +// @Benchmark +// def 
directMean(): Array[Double] = { +// tiles.repartition(10).select(agg_mean($"tile")).collect() +// } // @Benchmark // def computeCounts() = { // tiles.toDF("tile").select(data_cells($"tile") as "counts").agg(sum($"counts")).collect() // } - - } diff --git a/bench/src/main/scala/astraea/spark/rasterframes/bench/TileExplodeBench.scala b/bench/src/main/scala/astraea/spark/rasterframes/bench/TileExplodeBench.scala index 11069d635..ebd4f169c 100644 --- a/bench/src/main/scala/astraea/spark/rasterframes/bench/TileExplodeBench.scala +++ b/bench/src/main/scala/astraea/spark/rasterframes/bench/TileExplodeBench.scala @@ -56,7 +56,7 @@ class TileExplodeBench extends SparkEnv { @Benchmark def arrayExplode() = { - tiles.select(posexplode(tile_to_array[Double]($"tile"))).count() + tiles.select(posexplode(tile_to_array_double($"tile"))).count() } @Benchmark diff --git a/build.sbt b/build.sbt index ecf8948d5..05aed4e8d 100644 --- a/build.sbt +++ b/build.sbt @@ -12,7 +12,12 @@ lazy val deployment = project .dependsOn(root) .disablePlugins(SparkPackagePlugin) +lazy val IntegrationTest = config("it") extend Test + lazy val core = project + .configs(IntegrationTest) + .settings(inConfig(IntegrationTest)(Defaults.testSettings)) + .settings(Defaults.itSettings) .disablePlugins(SparkPackagePlugin) lazy val pyrasterframes = project diff --git a/core/src/it/resources/log4j.properties b/core/src/it/resources/log4j.properties new file mode 100644 index 000000000..378ae8e61 --- /dev/null +++ b/core/src/it/resources/log4j.properties @@ -0,0 +1,46 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the console +log4j.rootCategory=WARN, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +# Set the default spark-shell log level to WARN. When running the spark-shell, the +# log level for this class is used to overwrite the root logger's log level, so that +# the user can have different defaults for the shell and regular Spark apps. 
+log4j.logger.org.apache.spark.repl.Main=WARN + + +log4j.logger.org.apache=ERROR +log4j.logger.com.amazonaws=WARN +log4j.logger.geotrellis=INFO + +# Settings to quiet third party logs that are too verbose +log4j.logger.org.spark_project.jetty=WARN +log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR +log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO +log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO +log4j.logger.astraea.spark.rasterframes=DEBUG +log4j.logger.astraea.spark.rasterframes.ref=TRACE +log4j.logger.org.apache.parquet.hadoop.ParquetRecordReader=OFF + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL +log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR diff --git a/core/src/it/scala/astraea/spark/rasterframes/ref/RasterSourceIT.scala b/core/src/it/scala/astraea/spark/rasterframes/ref/RasterSourceIT.scala new file mode 100644 index 000000000..6f9069183 --- /dev/null +++ b/core/src/it/scala/astraea/spark/rasterframes/ref/RasterSourceIT.scala @@ -0,0 +1,61 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.ref + +import java.net.URI + +import astraea.spark.rasterframes.TestEnvironment.ReadMonitor +import astraea.spark.rasterframes.ref.RasterSource.FileGeoTiffRasterSource +import astraea.spark.rasterframes.{TestData, TestEnvironment} +import geotrellis.raster.io.geotiff.GeoTiff +import geotrellis.vector.Extent +import org.apache.spark.sql.rf.RasterSourceUDT + +/** + * + * + * @since 8/22/18 + */ +class RasterSourceIT extends TestEnvironment with TestData { + def sub(e: Extent) = { + val c = e.center + val w = e.width + val h = e.height + Extent(c.x, c.y, c.x + w * 0.1, c.y + h * 0.1) + } + + describe("RasterSource.readAll") { + it("should return consistently ordered tiles across bands for a given scene") { + // These specific scenes exhibit the problem where we see different subtile segment ordering across the bands of a given scene. 
+ val rURI = new URI("https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/016/034/LC08_L1TP_016034_20181003_20181003_01_RT/LC08_L1TP_016034_20181003_20181003_01_RT_B4.TIF") + val bURI = new URI("https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/016/034/LC08_L1TP_016034_20181003_20181003_01_RT/LC08_L1TP_016034_20181003_20181003_01_RT_B2.TIF") + + val red = RasterSource(rURI).readAll().left.get + val blue = RasterSource(bURI).readAll().left.get + + red should not be empty + red.size should equal(blue.size) + + red.map(_.dimensions) should contain theSameElementsAs blue.map(_.dimensions) + } + } +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/RasterFunctions.scala b/core/src/main/scala/astraea/spark/rasterframes/RasterFunctions.scala index e97b86267..ff08dd44c 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/RasterFunctions.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/RasterFunctions.scala @@ -19,11 +19,15 @@ package astraea.spark.rasterframes -import astraea.spark.rasterframes.encoders.SparkDefaultEncoders -import astraea.spark.rasterframes.expressions.ReprojectGeometry -import astraea.spark.rasterframes.functions.{CellCountAggregate, CellMeanAggregate, CellStatsAggregate} +import astraea.spark.rasterframes.expressions.TileAssembler +import astraea.spark.rasterframes.expressions.accessors._ +import astraea.spark.rasterframes.expressions.aggstats._ +import astraea.spark.rasterframes.expressions.generators._ +import astraea.spark.rasterframes.expressions.localops._ +import astraea.spark.rasterframes.expressions.tilestats._ +import astraea.spark.rasterframes.expressions.transformers._ import astraea.spark.rasterframes.stats.{CellHistogram, CellStatistics} -import astraea.spark.rasterframes.{expressions => E, functions => F} +import astraea.spark.rasterframes.{functions => F} import com.vividsolutions.jts.geom.{Envelope, Geometry} import geotrellis.proj4.CRS import geotrellis.raster.mapalgebra.local.LocalTileBinaryOp @@ -32,16 +36,14 @@ import org.apache.spark.annotation.Experimental import org.apache.spark.sql._ import org.apache.spark.sql.functions._ -import scala.reflect.runtime.universe._ - /** * UDFs for working with Tiles in Spark DataFrames. * * @since 4/3/17 */ trait RasterFunctions { - import SparkDefaultEncoders._ import util._ + import PrimitiveEncoders._ // format: off /** Create a row for each cell in Tile. */ @@ -49,23 +51,25 @@ trait RasterFunctions { /** Create a row for each cell in Tile with random sampling and optional seed. */ def explode_tiles_sample(sampleFraction: Double, seed: Option[Long], cols: Column*): Column = - E.ExplodeTiles(sampleFraction, seed, cols) + ExplodeTiles(sampleFraction, seed, cols) /** Create a row for each cell in Tile with random sampling (no seed). */ def explode_tiles_sample(sampleFraction: Double, cols: Column*): Column = - E.ExplodeTiles(sampleFraction, None, cols) + ExplodeTiles(sampleFraction, None, cols) /** Query the number of (cols, rows) in a Tile. */ - def tile_dimensions(col: Column): Column = E.GetDimensions(col) + def tile_dimensions(col: Column): Column = GetDimensions(col) /** Extracts the bounding box of a geometry as a JTS envelope. */ - def envelope(col: Column): TypedColumn[Any, Envelope] = E.GetEnvelope(col) + def envelope(col: Column): TypedColumn[Any, Envelope] = GetEnvelope(col) - /** Flattens Tile into an array. A numeric type parameter is required. 
*/ - @Experimental - def tile_to_array[T: HasCellType: TypeTag](col: Column): TypedColumn[Any, Array[T]] = withAlias("tile_to_array", col)( - udf[Array[T], Tile](F.tileToArray).apply(col) - ).as[Array[T]] + /** Flattens Tile into a double array. */ + def tile_to_array_double(col: Column): TypedColumn[Any, Array[Double]] = + TileToArrayDouble(col) + + /** Flattens Tile into an integer array. */ + def tile_to_array_int(col: Column): TypedColumn[Any, Array[Int]] = + TileToArrayInt(col) @Experimental /** Convert array in `arrayCol` into a Tile of dimensions `cols` and `rows`*/ @@ -75,204 +79,125 @@ trait RasterFunctions { /** Create a Tile from a column of cell data with location indexes and perform cell conversion. */ def assemble_tile(columnIndex: Column, rowIndex: Column, cellData: Column, tileCols: Int, tileRows: Int, ct: CellType): TypedColumn[Any, Tile] = - convert_cell_type(F.TileAssembler(columnIndex, rowIndex, cellData, lit(tileCols), lit(tileRows)), ct).as(cellData.columnName).as[Tile] + convert_cell_type(TileAssembler(columnIndex, rowIndex, cellData, lit(tileCols), lit(tileRows)), ct).as(cellData.columnName).as[Tile](singlebandTileEncoder) /** Create a Tile from a column of cell data with location indexes. */ def assemble_tile(columnIndex: Column, rowIndex: Column, cellData: Column, tileCols: Column, tileRows: Column): TypedColumn[Any, Tile] = - F.TileAssembler(columnIndex, rowIndex, cellData, tileCols, tileRows) + TileAssembler(columnIndex, rowIndex, cellData, tileCols, tileRows) /** Extract the Tile's cell type */ - def cell_type(col: Column): TypedColumn[Any, CellType] = E.GetCellType(col) + def cell_type(col: Column): TypedColumn[Any, CellType] = GetCellType(col) /** Change the Tile's cell type */ def convert_cell_type(col: Column, cellType: CellType): TypedColumn[Any, Tile] = - E.SetCellType(col, cellType) + SetCellType(col, cellType) /** Change the Tile's cell type */ def convert_cell_type(col: Column, cellTypeName: String): TypedColumn[Any, Tile] = - E.SetCellType(col, cellTypeName) + SetCellType(col, cellTypeName) /** Convert a bounding box structure to a Geometry type. Intended to support multiple schemas. */ - def bounds_geometry(bounds: Column): TypedColumn[Any, Geometry] = E.BoundsToGeometry(bounds) + def bounds_geometry(bounds: Column): TypedColumn[Any, Geometry] = BoundsToGeometry(bounds) /** Assign a `NoData` value to the Tiles. */ - def with_no_data(col: Column, nodata: Double) = withAlias("with_no_data", col)( + def with_no_data(col: Column, nodata: Double): TypedColumn[Any, Tile] = withAlias("with_no_data", col)( udf[Tile, Tile](F.withNoData(nodata)).apply(col) ).as[Tile] /** Compute the full column aggregate floating point histogram. */ - def agg_histogram(col: Column): TypedColumn[Any, CellHistogram] = - withAlias("histogram", col)( - F.aggHistogram(col) - ).as[CellHistogram] + def agg_approx_histogram(col: Column): TypedColumn[Any, CellHistogram] = + HistogramAggregate(col) /** Compute the full column aggregate floating point statistics. */ - def agg_stats(col: Column): TypedColumn[Any, CellStatistics] = withAlias("agg_stats", col)( - F.aggStats(col) - ).as[CellStatistics] + def agg_stats(col: Column): TypedColumn[Any, CellStatistics] = + CellStatsAggregate(col) /** Computes the column aggregate mean. */ def agg_mean(col: Column) = CellMeanAggregate(col) /** Computes the number of non-NoData cells in a column. 
   /** Computes the number of non-NoData cells in a column.
    */
-  def agg_data_cells(col: Column) = CellCountAggregate(true, col)
+  def agg_data_cells(col: Column): TypedColumn[Any, Long] = CellCountAggregate.DataCells(col)

   /** Computes the number of NoData cells in a column. */
-  def agg_no_data_cells(col: Column) = CellCountAggregate(false, col)
+  def agg_no_data_cells(col: Column): TypedColumn[Any, Long] = CellCountAggregate.NoDataCells(col)

   /** Compute the Tile-wise mean */
   def tile_mean(col: Column): TypedColumn[Any, Double] =
-    withAlias("tile_mean", col)(
-      udf[Double, Tile](F.tileMean).apply(col)
-    ).as[Double]
+    TileMean(col)

   /** Compute the Tile-wise sum */
   def tile_sum(col: Column): TypedColumn[Any, Double] =
-    withAlias("tile_sum", col)(
-      udf[Double, Tile](F.tileSum).apply(col)
-    ).as[Double]
+    Sum(col)

   /** Compute the minimum cell value in tile. */
   def tile_min(col: Column): TypedColumn[Any, Double] =
-    withAlias("tile_min", col)(
-      udf[Double, Tile](F.tileMin).apply(col)
-    ).as[Double]
+    TileMin(col)

   /** Compute the maximum cell value in tile. */
   def tile_max(col: Column): TypedColumn[Any, Double] =
-    withAlias("tile_max", col)(
-      udf[Double, Tile](F.tileMax).apply(col)
-    ).as[Double]
+    TileMax(col)

   /** Compute TileHistogram of Tile values. */
   def tile_histogram(col: Column): TypedColumn[Any, CellHistogram] =
-    withAlias("tile_histogram", col)(
-      udf[CellHistogram, Tile](F.tileHistogram).apply(col)
-    ).as[CellHistogram]
+    TileHistogram(col)

   /** Compute statistics of Tile values. */
   def tile_stats(col: Column): TypedColumn[Any, CellStatistics] =
-    withAlias("tile_stats", col)(
-      udf[CellStatistics, Tile](F.tileStats).apply(col)
-    ).as[CellStatistics]
+    TileStats(col)

   /** Counts the number of non-NoData cells per Tile. */
   def data_cells(tile: Column): TypedColumn[Any, Long] =
-    withAlias("data_cells", tile)(
-      udf(F.dataCells).apply(tile)
-    ).as[Long]
+    DataCells(tile)

   /** Counts the number of NoData cells per Tile. */
   def no_data_cells(tile: Column): TypedColumn[Any, Long] =
-    withAlias("no_data_cells", tile)(
-      udf(F.noDataCells).apply(tile)
-    ).as[Long]
-
+    NoDataCells(tile)

   def is_no_data_tile(tile: Column): TypedColumn[Any, Boolean] =
-    withAlias("is_no_data_tile", tile)(
-      udf(F.isNoDataTile).apply(tile)
-    ).as[Boolean]
+    IsNoDataTile(tile)

   /** Compute cell-local aggregate descriptive statistics for a column of Tiles. */
-  def local_agg_stats(col: Column): Column =
-    withAlias("local_agg_stats", col)(
-      F.localAggStats(col)
-    )
+  def agg_local_stats(col: Column) =
+    LocalStatsAggregate(col)

   /** Compute the cell-wise/local max operation between Tiles in a column. */
-  def local_agg_max(col: Column): TypedColumn[Any, Tile] =
-    withAlias("local_agg_max", col)(
-      F.localAggMax(col)
-    ).as[Tile]
+  def agg_local_max(col: Column): TypedColumn[Any, Tile] = LocalTileOpAggregate.LocalMaxUDAF(col)

   /** Compute the cellwise/local min operation between Tiles in a column. */
-  def local_agg_min(col: Column): TypedColumn[Any, Tile] =
-    withAlias("local_agg_min", col)(
-      F.localAggMin(col)
-    ).as[Tile]
+  def agg_local_min(col: Column): TypedColumn[Any, Tile] = LocalTileOpAggregate.LocalMinUDAF(col)

   /** Compute the cellwise/local mean operation between Tiles in a column. */
-  def local_agg_mean(col: Column): TypedColumn[Any, Tile] =
-    withAlias("local_agg_mean", col)(
-      F.localAggMean(col)
-    ).as[Tile]
+  def agg_local_mean(col: Column): TypedColumn[Any, Tile] = LocalMeanAggregate(col)

   /** Compute the cellwise/local count of non-NoData cells for all Tiles in a column.
    */
-  def local_agg_data_cells(col: Column): TypedColumn[Any, Tile] =
-    withAlias("local_agg_data_cells", col)(
-      F.localAggCount(col)
-    ).as[Tile]
+  def agg_local_data_cells(col: Column): TypedColumn[Any, Tile] = LocalCountAggregate.LocalDataCellsUDAF(col)

   /** Compute the cellwise/local count of NoData cells for all Tiles in a column. */
-  def local_agg_no_data_cells(col: Column): TypedColumn[Any, Tile] =
-    withAlias("local_agg_no_data_cells", col)(
-      F.localAggNodataCount(col)
-    ).as[Tile]
-
-  /** Cellwise addition between two Tiles. */
-  def local_add(left: Column, right: Column): TypedColumn[Any, Tile] =
-    withAlias("local_add", left, right)(
-      udf(F.localAdd).apply(left, right)
-    ).as[Tile]
+  def agg_local_no_data_cells(col: Column): TypedColumn[Any, Tile] = LocalCountAggregate.LocalNoDataCellsUDAF(col)

-  /** Cellwise addition of a scalar to a tile. */
-  def local_add_scalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = {
-    val f = value match {
-      case i: Int => F.localAddScalarInt(_: Tile, i)
-      case d: Double => F.localAddScalar(_: Tile, d)
-    }
+  /** Cellwise addition between two Tiles, or between a Tile and a scalar column. */
+  def local_add(left: Column, right: Column): TypedColumn[Any, Tile] = Add(left, right)

-    udf(f).apply(tileCol).as(s"local_add_scalar($tileCol, $value)").as[Tile]
-  }
+  /** Cellwise addition of a scalar value to a tile. */
+  def local_add[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = Add(tileCol, value)

   /** Cellwise subtraction between two Tiles. */
-  def local_subtract(left: Column, right: Column): TypedColumn[Any, Tile] =
-    withAlias("local_subtract", left, right)(
-      udf(F.localSubtract).apply(left, right)
-    ).as[Tile]
-
-  /** Cellwise subtraction of a scalar from a tile. */
-  def local_subtract_scalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = {
-    val f = value match {
-      case i: Int => F.localSubtractScalarInt(_: Tile, i)
-      case d: Double => F.localSubtractScalar(_: Tile, d)
-    }
+  def local_subtract(left: Column, right: Column): TypedColumn[Any, Tile] = Subtract(left, right)

-    udf(f).apply(tileCol).as(s"local_subtract_scalar($tileCol, $value)").as[Tile]
-  }
+  /** Cellwise subtraction of a scalar value from a tile. */
+  def local_subtract[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = Subtract(tileCol, value)

   /** Cellwise multiplication between two Tiles. */
-  def local_multiply(left: Column, right: Column): TypedColumn[Any, Tile] =
-    withAlias("local_multiply", left, right)(
-      udf(F.localMultiply).apply(left, right)
-    ).as[Tile]
+  def local_multiply(left: Column, right: Column): TypedColumn[Any, Tile] = Multiply(left, right)

-  /** Cellwise multiplication of a tile by a scalar. */
-  def local_multiply_scalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = {
-    val f = value match {
-      case i: Int => F.localMultiplyScalarInt(_: Tile, i)
-      case d: Double => F.localMultiplyScalar(_: Tile, d)
-    }
-
-    udf(f).apply(tileCol).as(s"local_multiply_scalar($tileCol, $value)").as[Tile]
-  }
+  /** Cellwise multiplication of a tile by a scalar value. */
+  def local_multiply[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = Multiply(tileCol, value)
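A hedged sketch of the consolidated arithmetic API shown above, which replaces the `*_scalar` variants with overloads; overload resolution picks the tile/tile or tile/scalar form (DataFrame and column names are hypothetical):

    val sum    = rf.select(local_add(col("t1"), col("t2")))   // tile + tile
    val plus1  = rf.select(local_add(col("t1"), 1))           // tile + Int scalar
    val scaled = rf.select(local_multiply(col("t1"), 0.5))    // tile * Double scalar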
   /** Cellwise division between two Tiles. */
-  def local_divide(left: Column, right: Column): TypedColumn[Any, Tile] =
-    withAlias("local_divide", left, right)(
-      udf(F.localDivide).apply(left, right)
-    ).as[Tile]
-
-  /** Cellwise division of a tile by a scalar.
-    */
-  def local_divide_scalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = {
-    val f = value match {
-      case i: Int => F.localDivideScalarInt(_: Tile, i)
-      case d: Double => F.localDivideScalar(_: Tile, d)
-    }
+  def local_divide(left: Column, right: Column): TypedColumn[Any, Tile] = Divide(left, right)

-    udf(f).apply(tileCol).as(s"local_divide_scalar($tileCol, $value)").as[Tile]
-  }
+  /** Cellwise division of a tile by a scalar value. */
+  def local_divide[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = Divide(tileCol, value)

   /** Perform an arbitrary GeoTrellis `LocalTileBinaryOp` between two Tile columns. */
   def local_algebra(op: LocalTileBinaryOp, left: Column, right: Column):
@@ -282,10 +207,8 @@ trait RasterFunctions {
     ).as[Tile]

   /** Compute the normalized difference of two tile columns */
-  def normalized_difference(left: Column, right: Column): TypedColumn[Any, Tile] =
-    withAlias("normalized_difference", left, right)(
-      udf(F.normalizedDifference).apply(left, right)
-    ).as[Tile]
+  def normalized_difference(left: Column, right: Column) =
+    NormalizedDifference(left, right)

   /** Constructor for constant tile column */
   def make_constant_tile(value: Number, cols: Int, rows: Int, cellType: String): TypedColumn[Any, Tile] =
@@ -301,21 +224,15 @@ trait RasterFunctions {

   /** Where the mask tile contains NODATA, replace values in the source tile with NODATA */
   def mask(sourceTile: Column, maskTile: Column): TypedColumn[Any, Tile] =
-    withAlias("mask", sourceTile, maskTile)(
-      udf(F.mask).apply(sourceTile, maskTile)
-    ).as[Tile]
+    Mask.MaskByDefined(sourceTile, maskTile)

   /** Where the mask tile equals the mask value, replace values in the source tile with NODATA */
   def mask_by_value(sourceTile: Column, maskTile: Column, maskValue: Column): TypedColumn[Any, Tile] =
-    withAlias("mask_by_value", sourceTile, maskTile, maskValue)(
-      udf(F.maskByValue).apply(sourceTile, maskTile, maskValue)
-    ).as[Tile]
+    Mask.MaskByValue(sourceTile, maskTile, maskValue)

   /** Where the mask tile DOES NOT contain NODATA, replace values in the source tile with NODATA */
   def inverse_mask(sourceTile: Column, maskTile: Column): TypedColumn[Any, Tile] =
-    withAlias("inverse_mask", sourceTile, maskTile)(
-      udf(F.inverseMask).apply(sourceTile, maskTile)
-    ).as[Tile]
+    Mask.InverseMaskByDefined(sourceTile, maskTile)

   /** Create a tile where cells in the grid defined by cols, rows, and bounds are filled with the given value. */
   def rasterize(geometry: Column, bounds: Column, value: Column, cols: Int, rows: Int): TypedColumn[Any, Tile] =
@@ -335,101 +252,102 @@ trait RasterFunctions {
   def reproject_geometry(sourceGeom: Column, srcCRS: CRS, dstCRS: CRS): TypedColumn[Any, Geometry] =
     ReprojectGeometry(sourceGeom, srcCRS, dstCRS)

-  /** Render Tile as ASCII string for debugging purposes. */
-  @Experimental
+  /** Render Tile as ASCII string, for debugging purposes. */
   def render_ascii(col: Column): TypedColumn[Any, String] =
-    withAlias("render_ascii", col)(
-      udf[String, Tile](F.renderAscii).apply(col)
-    ).as[String]
+    DebugRender.RenderAscii(col)
+
+  /** Render Tile cell values as numeric values, for debugging purposes. */
+  def render_matrix(col: Column): TypedColumn[Any, String] =
+    DebugRender.RenderMatrix(col)
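A hedged example of the expression-backed functions above: normalized_difference computes the standard cell-wise (a - b) / (a + b), e.g. NDVI from near-infrared and red bands, and mask drops cells where the mask tile is NoData (column names are hypothetical):

    val ndvi   = rf.select(normalized_difference(col("nir"), col("red")).as("ndvi"))
    val masked = rf.select(mask(col("tile"), col("mask_tile")))  // NoData where mask is NoData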
   /** Cellwise less than value comparison between two tiles. */
   def local_less(left: Column, right: Column): TypedColumn[Any, Tile] =
-    withAlias("local_less", left, right)(
-      udf(F.localLess).apply(left, right)
-    ).as[Tile]
-
+    Less(left, right)
   /** Cellwise less than value comparison between a tile and a scalar. */
-  def local_less_scalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = {
-    val f = value match{
-      case i: Int ⇒ F.localLessScalarInt(_: Tile, i)
-      case d: Double ⇒ F.localLessScalar(_: Tile, d)
-    }
-    udf(f).apply(tileCol).as(s"local_less_scalar($tileCol, $value)").as[Tile]
-  }
+  def local_less[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] =
+    Less(tileCol, value)

   /** Cellwise less than or equal to value comparison between two tiles. */
   def local_less_equal(left: Column, right: Column): TypedColumn[Any, Tile] =
-    withAlias("local_less_equal", left, right)(
-      udf(F.localLess).apply(left, right)
-    ).as[Tile]
+    LessEqual(left, right)

   /** Cellwise less than or equal to value comparison between a tile and a scalar. */
-  def local_less_equal_scalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = {
-    val f = value match{
-      case i: Int ⇒ F.localLessEqualScalarInt(_: Tile, i)
-      case d: Double ⇒ F.localLessEqualScalar(_: Tile, d)
-    }
-    udf(f).apply(tileCol).as(s"local_less_equal_scalar($tileCol, $value)").as[Tile]
-  }
+  def local_less_equal[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] =
+    LessEqual(tileCol, value)

   /** Cellwise greater than value comparison between two tiles. */
   def local_greater(left: Column, right: Column): TypedColumn[Any, Tile] =
-    withAlias("local_greater", left, right)(
-      udf(F.localGreater).apply(left, right)
-    ).as[Tile]
-
+    Greater(left, right)
   /** Cellwise greater than value comparison between a tile and a scalar. */
-  def local_greater_scalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = {
-    val f = value match{
-      case i: Int ⇒ F.localGreaterScalarInt(_: Tile, i)
-      case d: Double ⇒ F.localGreaterScalar(_: Tile, d)
-    }
-    udf(f).apply(tileCol).as(s"local_greater_scalar($tileCol, $value)").as[Tile]
-  }
+  def local_greater[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] =
+    Greater(tileCol, value)

   /** Cellwise greater than or equal to value comparison between two tiles. */
   def local_greater_equal(left: Column, right: Column): TypedColumn[Any, Tile] =
-    withAlias("local_greater_equal", left, right)(
-      udf(F.localGreaterEqual).apply(left, right)
-    ).as[Tile]
+    GreaterEqual(left, right)

   /** Cellwise greater than or equal to value comparison between a tile and a scalar. */
-  def local_greater_equal_scalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = {
-    val f = value match{
-      case i: Int ⇒ F.localGreaterEqualScalarInt(_: Tile, i)
-      case d: Double ⇒ F.localGreaterEqualScalar(_: Tile, d)
-    }
-    udf(f).apply(tileCol).as(s"local_greater_equal_scalar($tileCol, $value)").as[Tile]
-  }
+  def local_greater_equal[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] =
+    GreaterEqual(tileCol, value)

   /** Cellwise equal to value comparison between two tiles. */
   def local_equal(left: Column, right: Column): TypedColumn[Any, Tile] =
-    withAlias("local_equal", left, right)(
-      udf(F.localEqual).apply(left, right)
-    ).as[Tile]
+    Equal(left, right)

   /** Cellwise equal to value comparison between a tile and a scalar.
    */
-  def local_equal_scalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = {
-    val f = value match{
-      case i: Int ⇒ F.localEqualScalarInt(_: Tile, i)
-      case d: Double ⇒ F.localEqualScalar(_: Tile, d)
-    }
-    udf(f).apply(tileCol).as(s"local_equal_scalar($tileCol, $value)").as[Tile]
-  }
+  def local_equal[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] =
+    Equal(tileCol, value)
+
   /** Cellwise inequality comparison between two tiles. */
   def local_unequal(left: Column, right: Column): TypedColumn[Any, Tile] =
-    withAlias("local_unequal", left, right)(
-      udf(F.localUnequal).apply(left, right)
-    ).as[Tile]
+    Unequal(left, right)

   /** Cellwise inequality comparison between a tile and a scalar. */
-  def local_unequal_scalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = {
-    val f = value match{
-      case i: Int ⇒ F.localUnequalScalarInt(_: Tile, i)
-      case d: Double ⇒ F.localUnequalScalar(_: Tile, d)
-    }
-    udf(f).apply(tileCol).as(s"local_unequal_scalar($tileCol, $value)").as[Tile]
-  }
+  def local_unequal[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] =
+    Unequal(tileCol, value)
+
+  /** Round cell values to nearest integer without changing cell type. */
+  def round(tileCol: Column): TypedColumn[Any, Tile] =
+    Round(tileCol)
+
+  /** Take natural logarithm of cell values. */
+  def log(tileCol: Column): TypedColumn[Any, Tile] =
+    Log(tileCol)
+
+  /** Take base 10 logarithm of cell values. */
+  def log10(tileCol: Column): TypedColumn[Any, Tile] =
+    Log10(tileCol)
+
+  /** Take base 2 logarithm of cell values. */
+  def log2(tileCol: Column): TypedColumn[Any, Tile] =
+    Log2(tileCol)
+
+  /** Natural logarithm of one plus cell values. */
+  def log1p(tileCol: Column): TypedColumn[Any, Tile] =
+    Log1p(tileCol)
+
+  /** Exponential of cell values. */
+  def exp(tileCol: Column): TypedColumn[Any, Tile] =
+    Exp(tileCol)
+
+  /** Ten to the power of cell values. */
+  def exp10(tileCol: Column): TypedColumn[Any, Tile] =
+    Exp10(tileCol)
+
+  /** Two to the power of cell values. */
+  def exp2(tileCol: Column): TypedColumn[Any, Tile] =
+    Exp2(tileCol)
+
+  /** Exponential of cell values, less one. */
+  def expm1(tileCol: Column): TypedColumn[Any, Tile] =
+    ExpM1(tileCol)
+
+  /** Resample tile using nearest-neighbor, scaling by a constant factor. */
+  def resample[T: Numeric](tileCol: Column, value: T) = Resample(tileCol, value)
+
+  /** Resample tile using nearest-neighbor, with the scale factor given by a column. */
+  def resample(tileCol: Column, column2: Column) = Resample(tileCol, column2)
+
 }
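A hedged sketch of the new cell-wise math functions, which compose like any other Column expression; exp after log should approximately round-trip positive cell values, modulo floating-point error (column name hypothetical):

    val roundTrip = rf.select(exp(log(col("tile"))))
    val upsampled = rf.select(resample(col("tile"), 2))  // nearest-neighbor, 2x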
diff --git a/core/src/main/scala/astraea/spark/rasterframes/StandardColumns.scala b/core/src/main/scala/astraea/spark/rasterframes/StandardColumns.scala
index 2a0104b8e..340b17198 100644
--- a/core/src/main/scala/astraea/spark/rasterframes/StandardColumns.scala
+++ b/core/src/main/scala/astraea/spark/rasterframes/StandardColumns.scala
@@ -6,17 +6,16 @@ import geotrellis.raster.{Tile, TileFeature}
 import geotrellis.spark.{SpatialKey, TemporalKey}
 import org.apache.spark.sql.functions.col
 import com.vividsolutions.jts.geom.{Point => jtsPoint, Polygon => jtsPolygon}
-import astraea.spark.rasterframes.encoders.SparkDefaultEncoders._
-import astraea.spark.rasterframes.encoders.StandardEncoders
 import geotrellis.proj4.CRS
 import geotrellis.vector.Extent
+import astraea.spark.rasterframes.encoders.StandardEncoders.PrimitiveEncoders._

 /**
  * Constants identifying column in most RasterFrames.
  *
  * @since 2/19/18
  */
-trait StandardColumns extends StandardEncoders {
+trait StandardColumns {

   /** Default RasterFrame spatial column name.
    */
   val SPATIAL_KEY_COLUMN = col("spatial_key").as[SpatialKey]
diff --git a/core/src/main/scala/astraea/spark/rasterframes/encoders/CatalystSerializer.scala b/core/src/main/scala/astraea/spark/rasterframes/encoders/CatalystSerializer.scala
index e172c5b92..3f09e1f38 100644
--- a/core/src/main/scala/astraea/spark/rasterframes/encoders/CatalystSerializer.scala
+++ b/core/src/main/scala/astraea/spark/rasterframes/encoders/CatalystSerializer.scala
@@ -22,15 +22,9 @@ package astraea.spark.rasterframes.encoders

 import astraea.spark.rasterframes.encoders.CatalystSerializer.CatalystIO
-import astraea.spark.rasterframes.ref.{RasterRef, RasterSource}
-import astraea.spark.rasterframes.util.CRSParser
-import com.vividsolutions.jts.geom.Envelope
-import geotrellis.proj4.CRS
-import geotrellis.raster.{CellType, Tile}
-import geotrellis.vector.Extent
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.rf.{RasterSourceUDT, TileUDT}
+import org.apache.spark.sql.catalyst.util.ArrayData
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String

@@ -53,7 +47,7 @@ trait CatalystSerializer[T] extends Serializable {
   final def fromInternalRow(row: InternalRow): T = from(row, CatalystIO[InternalRow])
 }

-object CatalystSerializer {
+object CatalystSerializer extends StandardSerializers {
   def apply[T: CatalystSerializer]: CatalystSerializer[T] = implicitly

   /**
@@ -66,6 +60,9 @@ object CatalystSerializer {
   trait CatalystIO[R] extends Serializable {
     def create(values: Any*): R
     def to[T: CatalystSerializer](t: T): R = CatalystSerializer[T].to(t, this)
+    def toSeq[T: CatalystSerializer](t: Seq[T]): AnyRef
+    def get[T: CatalystSerializer](d: R, ordinal: Int): T
+    def getSeq[T: CatalystSerializer](d: R, ordinal: Int): Seq[T]
     def isNullAt(d: R, ordinal: Int): Boolean
     def getBoolean(d: R, ordinal: Int): Boolean
     def getByte(d: R, ordinal: Int): Byte
@@ -76,7 +73,6 @@ object CatalystSerializer {
     def getDouble(d: R, ordinal: Int): Double
     def getString(d: R, ordinal: Int): String
     def getByteArray(d: R, ordinal: Int): Array[Byte]
-    def get[T: CatalystSerializer](d: R, ordinal: Int): T
     def encode(str: String): AnyRef
   }

@@ -93,11 +89,17 @@ object CatalystSerializer {
     override def getFloat(d: R, ordinal: Int): Float = d.getFloat(ordinal)
     override def getDouble(d: R, ordinal: Int): Double = d.getDouble(ordinal)
     override def getString(d: R, ordinal: Int): String = d.getString(ordinal)
-    override def getByteArray(d: R, ordinal: Int): Array[Byte] = d.get(ordinal).asInstanceOf[Array[Byte]]
+    override def getByteArray(d: R, ordinal: Int): Array[Byte] =
+      d.get(ordinal).asInstanceOf[Array[Byte]]
     override def get[T: CatalystSerializer](d: R, ordinal: Int): T = {
-      val struct = d.getStruct(ordinal)
-      struct.to[T]
+      d.getAs[Any](ordinal) match {
+        case r: Row => r.to[T]
+        case o => o.asInstanceOf[T]
+      }
     }
+    override def toSeq[T: CatalystSerializer](t: Seq[T]): AnyRef = t.map(_.toRow)
+    override def getSeq[T: CatalystSerializer](d: R, ordinal: Int): Seq[T] =
+      d.getSeq[Row](ordinal).map(_.to[T])
     override def encode(str: String): String = str
   }

@@ -122,92 +124,22 @@ object CatalystSerializer {
       struct.to[T]
     }
     override def create(values: Any*): InternalRow = InternalRow(values: _*)
+    override def toSeq[T: CatalystSerializer](t: Seq[T]): ArrayData =
+      ArrayData.toArrayData(t.map(_.toInternalRow).toArray)
+
+    override def getSeq[T: CatalystSerializer](d: InternalRow, ordinal: Int): Seq[T] = {
+      val ad = d.getArray(ordinal)
+      val result = Array.ofDim[Any](ad.numElements()).asInstanceOf[Array[T]]
+      ad.foreach(
+        CatalystSerializer[T].schema,
+        (i, v) => result(i) = v.asInstanceOf[InternalRow].to[T]
+      )
+      result.toSeq
+    }
     override def encode(str: String): UTF8String = UTF8String.fromString(str)
   }
 }

-  implicit val envelopeSerializer: CatalystSerializer[Envelope] = new CatalystSerializer[Envelope] {
-    override def schema: StructType = StructType(Seq(
-      StructField("minX", DoubleType, false),
-      StructField("maxX", DoubleType, false),
-      StructField("minY", DoubleType, false),
-      StructField("maxY", DoubleType, false)
-    ))
-
-    override protected def to[R](t: Envelope, io: CatalystIO[R]): R = io.create(
-      t.getMinX, t.getMaxX, t.getMinY, t.getMaxX
-    )
-
-    override protected def from[R](t: R, io: CatalystIO[R]): Envelope = new Envelope(
-      io.getDouble(t, 0), io.getDouble(t, 1), io.getDouble(t, 2), io.getDouble(t, 3)
-    )
-  }
-
-  implicit val extentSerializer: CatalystSerializer[Extent] = new CatalystSerializer[Extent] {
-    override def schema: StructType = StructType(Seq(
-      StructField("xmin", DoubleType, false),
-      StructField("ymin", DoubleType, false),
-      StructField("xmax", DoubleType, false),
-      StructField("ymax", DoubleType, false)
-    ))
-    override def to[R](t: Extent, io: CatalystIO[R]): R = io.create(
-      t.xmin, t.ymin, t.xmax, t.ymax
-    )
-    override def from[R](row: R, io: CatalystIO[R]): Extent = Extent(
-      io.getDouble(row, 0), io.getDouble(row, 1), io.getDouble(row, 2), io.getDouble(row, 3)
-    )
-  }
-
-  implicit val crsSerializer: CatalystSerializer[CRS] = new CatalystSerializer[CRS] {
-    override def schema: StructType = StructType(Seq(
-      StructField("crsProj4", StringType, false)
-    ))
-    override def to[R](t: CRS, io: CatalystIO[R]): R = io.create(
-      io.encode(
-        // Don't do this... it's 1000x slower to decode.
-        //t.epsgCode.map(c => "EPSG:" + c).getOrElse(t.toProj4String)
-        t.toProj4String
-      )
-    )
-    override def from[R](row: R, io: CatalystIO[R]): CRS =
-      CRSParser(io.getString(row, 0))
-  }
-
-  implicit val cellTypeSerializer: CatalystSerializer[CellType] = new CatalystSerializer[CellType] {
-    override def schema: StructType = StructType(Seq(
-      StructField("cellTypeName", StringType, false)
-    ))
-    override def to[R](t: CellType, io: CatalystIO[R]): R = io.create(
-      io.encode(t.toString())
-    )
-    override def from[R](row: R, io: CatalystIO[R]): CellType =
-      CellType.fromName(io.getString(row, 0))
-  }
-
-  implicit val rasterRefSerializer: CatalystSerializer[RasterRef] = new CatalystSerializer[RasterRef] {
-    val rsType = new RasterSourceUDT()
-    override def schema: StructType = StructType(Seq(
-      StructField("source", rsType, false),
-      StructField("subextent", apply[Extent].schema, true)
-    ))
-
-    override def to[R](t: RasterRef, io: CatalystIO[R]): R = io.create(
-      io.to(t.source),
-      t.subextent.map(io.to[Extent]).orNull
-    )
-
-    override def from[R](row: R, io: CatalystIO[R]): RasterRef = RasterRef(
-      io.get[RasterSource](row, 0),
-      if (io.isNullAt(row, 1)) None
-      else Option(io.get[Extent](row, 1))
-    )
-  }
-
-  private[rasterframes]
-  implicit def tileSerializer: CatalystSerializer[Tile] = TileUDT.tileSerializer
-  private[rasterframes]
-  implicit def rasterSourceSerializer: CatalystSerializer[RasterSource] = RasterSourceUDT.rasterSourceSerializer
-
   implicit class WithToRow[T: CatalystSerializer](t: T) {
     def toInternalRow: InternalRow = CatalystSerializer[T].toInternalRow(t)
     def toRow: Row = CatalystSerializer[T].toRow(t)
diff --git a/core/src/main/scala/astraea/spark/rasterframes/encoders/CatalystSerializerEncoder.scala b/core/src/main/scala/astraea/spark/rasterframes/encoders/CatalystSerializerEncoder.scala
index a1538e84a..27e452329 100644
--- a/core/src/main/scala/astraea/spark/rasterframes/encoders/CatalystSerializerEncoder.scala
+++ b/core/src/main/scala/astraea/spark/rasterframes/encoders/CatalystSerializerEncoder.scala
@@ -21,47 +21,62 @@ package astraea.spark.rasterframes.encoders

 import org.apache.spark.sql.catalyst.analysis.GetColumnByOrdinal
-import org.apache.spark.sql.catalyst.{InternalRow, ScalaReflection}
-import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
-import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.types.{DataType, StructField, StructType}
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
+import org.apache.spark.sql.catalyst.{InternalRow, ScalaReflection}
+import org.apache.spark.sql.types.{DataType, ObjectType, StructField, StructType}

 import scala.reflect.runtime.universe.TypeTag

 object CatalystSerializerEncoder {
   case class CatSerializeToRow[T](child: Expression, serde: CatalystSerializer[T])
-    extends UnaryExpression with CodegenFallback {
+    extends UnaryExpression {
     override def dataType: DataType = serde.schema
     override protected def nullSafeEval(input: Any): Any = {
       val value = input.asInstanceOf[T]
       serde.toInternalRow(value)
     }
+    override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+      val cs = ctx.addReferenceObj("serde", serde, serde.getClass.getName)
+      nullSafeCodeGen(ctx, ev, input => s"${ev.value} = $cs.toInternalRow($input);")
+    }
   }

   case class CatDeserializeFromRow[T](child: Expression, serde: CatalystSerializer[T], outputType: DataType)
-    extends UnaryExpression with CodegenFallback {
+    extends UnaryExpression {
     override def dataType: DataType = outputType
+
+    private def objType = outputType match {
+      case ot: ObjectType => ot.cls.getName
+      case o => s"java.lang.Object /* $o */" // not sure what to do here... hopefully shouldn't happen
+    }
     override protected def nullSafeEval(input: Any): Any = {
       val row = input.asInstanceOf[InternalRow]
       serde.fromInternalRow(row)
     }
+    override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+      val cs = ctx.addReferenceObj("serde", serde, classOf[CatalystSerializer[_]].getName)
+      nullSafeCodeGen(ctx, ev, input => s"${ev.value} = ($objType) $cs.fromInternalRow($input);")
+    }
   }

-  def apply[T: TypeTag: CatalystSerializer]: ExpressionEncoder[T] = {
+  def apply[T: TypeTag: CatalystSerializer](flat: Boolean = false): ExpressionEncoder[T] = {
     val serde = CatalystSerializer[T]

-    val schema = StructType(Seq(
-      StructField("value", serde.schema)
-    ))
+    val schema = if (flat)
+      StructType(Seq(
+        StructField("value", serde.schema, true)
+      ))
+    else serde.schema

     val parentType: DataType = ScalaReflection.dataTypeFor[T]

-    val inputObject = BoundReference(0, parentType, nullable = false)
+    val inputObject = BoundReference(0, parentType, nullable = true)

     val serializer = CatSerializeToRow(inputObject, serde)
     val deserializer: Expression = CatDeserializeFromRow(GetColumnByOrdinal(0, schema), serde, parentType)

-    ExpressionEncoder(schema, flat = false, Seq(serializer), deserializer, typeToClassTag[T])
+    ExpressionEncoder(schema, flat = flat, Seq(serializer), deserializer, typeToClassTag[T])
   }
 }
diff --git a/core/src/main/scala/astraea/spark/rasterframes/encoders/CellTypeEncoder.scala b/core/src/main/scala/astraea/spark/rasterframes/encoders/CellTypeEncoder.scala
index 82df2bdff..953c2ed65 100644
--- a/core/src/main/scala/astraea/spark/rasterframes/encoders/CellTypeEncoder.scala
+++ b/core/src/main/scala/astraea/spark/rasterframes/encoders/CellTypeEncoder.scala
@@ -24,9 +24,9 @@ import org.apache.spark.sql.catalyst.ScalaReflection
 import org.apache.spark.sql.catalyst.analysis.GetColumnByOrdinal
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.rf.VersionShims.InvokeSafely
-import org.apache.spark.sql.types.{ObjectType, StringType, StructField, StructType}
+import org.apache.spark.sql.types.{ObjectType, StringType}
 import org.apache.spark.unsafe.types.UTF8String
-import CatalystSerializer._
+
 import scala.reflect.classTag

 /**
diff --git a/core/src/main/scala/astraea/spark/rasterframes/encoders/EnvelopeEncoder.scala b/core/src/main/scala/astraea/spark/rasterframes/encoders/EnvelopeEncoder.scala
index f65227e9a..5888a1974 100644
--- a/core/src/main/scala/astraea/spark/rasterframes/encoders/EnvelopeEncoder.scala
+++ b/core/src/main/scala/astraea/spark/rasterframes/encoders/EnvelopeEncoder.scala
@@ -8,7 +8,7 @@ import org.apache.spark.sql.catalyst.expressions.objects.NewInstance
 import org.apache.spark.sql.catalyst.expressions.{BoundReference, CreateNamedStruct, Literal}
 import org.apache.spark.sql.rf.VersionShims.InvokeSafely
 import org.apache.spark.sql.types._
-import CatalystSerializer._
+
 import scala.reflect.classTag

 /**
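For orientation, a hedged sketch of how the reworked `CatalystSerializerEncoder.apply` above might be used; with `flat = true` the serializer schema is wrapped in a single "value" struct field, otherwise the struct schema is used directly (implicit `CatalystSerializer[Extent]` availability is assumed):

    import geotrellis.vector.Extent
    import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

    val structEnc: ExpressionEncoder[Extent] = CatalystSerializerEncoder[Extent]()            // schema = Extent struct
    val flatEnc: ExpressionEncoder[Extent]   = CatalystSerializerEncoder[Extent](flat = true) // schema = single "value" field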
core/src/main/scala/astraea/spark/rasterframes/encoders/SparkDefaultEncoders.scala
rename to core/src/main/scala/astraea/spark/rasterframes/encoders/SparkBasicEncoders.scala
index 9763439f9..670d2e217 100644
--- a/core/src/main/scala/astraea/spark/rasterframes/encoders/SparkDefaultEncoders.scala
+++ b/core/src/main/scala/astraea/spark/rasterframes/encoders/SparkBasicEncoders.scala
@@ -29,11 +29,12 @@ import scala.reflect.runtime.universe._
  *
  * @since 12/28/17
  */
-private[rasterframes] trait SparkDefaultEncoders {
+private[rasterframes] trait SparkBasicEncoders {
   implicit def arrayEnc[T: TypeTag]: Encoder[Array[T]] = ExpressionEncoder()
-  implicit def genEnc[T: TypeTag]: Encoder[T] = ExpressionEncoder()
-  implicit val intEnc = Encoders.scalaInt
-  implicit val stringEnc = Encoders.STRING
+  implicit val intEnc: Encoder[Int] = Encoders.scalaInt
+  implicit val longEnc: Encoder[Long] = Encoders.scalaLong
+  implicit val stringEnc: Encoder[String] = Encoders.STRING
+  implicit val doubleEnc: Encoder[Double] = Encoders.scalaDouble
+  implicit val boolEnc: Encoder[Boolean] = Encoders.scalaBoolean
 }
-private[rasterframes] object SparkDefaultEncoders extends SparkDefaultEncoders
diff --git a/core/src/main/scala/astraea/spark/rasterframes/encoders/StandardEncoders.scala b/core/src/main/scala/astraea/spark/rasterframes/encoders/StandardEncoders.scala
index 49fb82b0e..625eea1cd 100644
--- a/core/src/main/scala/astraea/spark/rasterframes/encoders/StandardEncoders.scala
+++ b/core/src/main/scala/astraea/spark/rasterframes/encoders/StandardEncoders.scala
@@ -20,13 +20,13 @@ package astraea.spark.rasterframes.encoders

 import java.net.URI
+import java.sql.Timestamp

-import astraea.spark.rasterframes.ref.{RasterRef, RasterSource}
-import astraea.spark.rasterframes.stats.{CellHistogram, CellStatistics}
-import astraea.spark.rasterframes.tiles.ProjectedRasterTile
+import astraea.spark.rasterframes.model._
+import astraea.spark.rasterframes.stats.{CellHistogram, CellStatistics, LocalCellStatistics}
 import com.vividsolutions.jts.geom.Envelope
 import geotrellis.proj4.CRS
-import geotrellis.raster.{CellType, ProjectedRaster, Tile}
+import geotrellis.raster.{CellSize, CellType, Tile, TileLayout}
 import geotrellis.spark.tiling.LayoutDefinition
 import geotrellis.spark.{KeyBounds, SpaceTimeKey, SpatialKey, TemporalKey, TemporalProjectedExtent, TileLayerMetadata}
 import geotrellis.vector.{Extent, ProjectedExtent}
@@ -38,28 +38,36 @@ import scala.reflect.runtime.universe._

 /**
  * Implicit encoder definitions for RasterFrame types.
  */
-trait StandardEncoders extends SpatialEncoders{
+trait StandardEncoders extends SpatialEncoders {
+  object PrimitiveEncoders extends SparkBasicEncoders
+  def expressionEncoder[T: TypeTag]: ExpressionEncoder[T] = ExpressionEncoder()
   implicit def spatialKeyEncoder: ExpressionEncoder[SpatialKey] = ExpressionEncoder()
   implicit def temporalKeyEncoder: ExpressionEncoder[TemporalKey] = ExpressionEncoder()
   implicit def spaceTimeKeyEncoder: ExpressionEncoder[SpaceTimeKey] = ExpressionEncoder()
-  implicit def statsEncoder: ExpressionEncoder[CellStatistics] = ExpressionEncoder()
-  implicit def histEncoder: ExpressionEncoder[CellHistogram] = ExpressionEncoder()
   implicit def layoutDefinitionEncoder: ExpressionEncoder[LayoutDefinition] = ExpressionEncoder()
   implicit def stkBoundsEncoder: ExpressionEncoder[KeyBounds[SpaceTimeKey]] = ExpressionEncoder()
-  implicit def extentEncoder: ExpressionEncoder[Extent] = ExpressionEncoder()
-
+  implicit def extentEncoder: ExpressionEncoder[Extent] = ExpressionEncoder[Extent]()
   implicit def singlebandTileEncoder: ExpressionEncoder[Tile] = ExpressionEncoder()
-  implicit def projectedRasterTileEncoder: ExpressionEncoder[ProjectedRasterTile] = ExpressionEncoder()
   implicit def tileLayerMetadataEncoder[K: TypeTag]: ExpressionEncoder[TileLayerMetadata[K]] = TileLayerMetadataEncoder()
   implicit def crsEncoder: ExpressionEncoder[CRS] = CRSEncoder()
   implicit def projectedExtentEncoder: ExpressionEncoder[ProjectedExtent] = ProjectedExtentEncoder()
   implicit def temporalProjectedExtentEncoder: ExpressionEncoder[TemporalProjectedExtent] = TemporalProjectedExtentEncoder()
   implicit def cellTypeEncoder: ExpressionEncoder[CellType] = CellTypeEncoder()
+  implicit def cellSizeEncoder: ExpressionEncoder[CellSize] = ExpressionEncoder()
   implicit def uriEncoder: ExpressionEncoder[URI] = URIEncoder()
   implicit def envelopeEncoder: ExpressionEncoder[Envelope] = EnvelopeEncoder()
-  implicit def rrEncoder: ExpressionEncoder[RasterRef] = ExpressionEncoder()
-  implicit def prEncoder: ExpressionEncoder[ProjectedRaster[Tile]] = ExpressionEncoder()
-  implicit def rsEncoder: ExpressionEncoder[RasterSource] = ExpressionEncoder()
+  implicit def timestampEncoder: ExpressionEncoder[Timestamp] = ExpressionEncoder()
+  implicit def strMapEncoder: ExpressionEncoder[Map[String, String]] = ExpressionEncoder()
+  implicit def cellStatsEncoder: ExpressionEncoder[CellStatistics] = ExpressionEncoder()
+  implicit def cellHistEncoder: ExpressionEncoder[CellHistogram] = ExpressionEncoder()
+  implicit def localCellStatsEncoder: ExpressionEncoder[LocalCellStatistics] = ExpressionEncoder()
+  implicit def tilelayoutEncoder: ExpressionEncoder[TileLayout] = ExpressionEncoder()
+  implicit def cellContextEncoder: ExpressionEncoder[CellContext] = CellContext.encoder
+  implicit def cellsEncoder: ExpressionEncoder[Cells] = Cells.encoder
+  implicit def tileContextEncoder: ExpressionEncoder[TileContext] = TileContext.encoder
+  implicit def tileDataContextEncoder: ExpressionEncoder[TileDataContext] = TileDataContext.encoder
+
 }

 object StandardEncoders extends StandardEncoders
diff --git a/core/src/main/scala/astraea/spark/rasterframes/encoders/StandardSerializers.scala b/core/src/main/scala/astraea/spark/rasterframes/encoders/StandardSerializers.scala
new file mode 100644
index 000000000..aaff5c534
--- /dev/null
+++ b/core/src/main/scala/astraea/spark/rasterframes/encoders/StandardSerializers.scala
@@ -0,0 +1,251 @@
+/*
+ * This software is licensed under the Apache 2 license, quoted below.
+ *
+ * Copyright 2019 Astraea, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * [http://www.apache.org/licenses/LICENSE-2.0]
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ */
+
+package astraea.spark.rasterframes.encoders
+import astraea.spark.rasterframes.encoders.CatalystSerializer.CatalystIO
+import astraea.spark.rasterframes.util.CRSParser
+import com.vividsolutions.jts.geom.Envelope
+import geotrellis.proj4.CRS
+import geotrellis.raster._
+import geotrellis.spark._
+import geotrellis.spark.tiling.LayoutDefinition
+import geotrellis.vector._
+import org.apache.spark.sql.types._
+
+/** Collection of CatalystSerializers for third-party types. */
+trait StandardSerializers {
+
+  implicit val envelopeSerializer: CatalystSerializer[Envelope] = new CatalystSerializer[Envelope] {
+    override def schema: StructType = StructType(Seq(
+      StructField("minX", DoubleType, false),
+      StructField("maxX", DoubleType, false),
+      StructField("minY", DoubleType, false),
+      StructField("maxY", DoubleType, false)
+    ))
+
+    override protected def to[R](t: Envelope, io: CatalystIO[R]): R = io.create(
+      t.getMinX, t.getMaxX, t.getMinY, t.getMaxY
+    )
+
+    override protected def from[R](t: R, io: CatalystIO[R]): Envelope = new Envelope(
+      io.getDouble(t, 0), io.getDouble(t, 1), io.getDouble(t, 2), io.getDouble(t, 3)
+    )
+  }
+
+  implicit val extentSerializer: CatalystSerializer[Extent] = new CatalystSerializer[Extent] {
+    override def schema: StructType = StructType(Seq(
+      StructField("xmin", DoubleType, false),
+      StructField("ymin", DoubleType, false),
+      StructField("xmax", DoubleType, false),
+      StructField("ymax", DoubleType, false)
+    ))
+    override def to[R](t: Extent, io: CatalystIO[R]): R = io.create(
+      t.xmin, t.ymin, t.xmax, t.ymax
+    )
+    override def from[R](row: R, io: CatalystIO[R]): Extent = Extent(
+      io.getDouble(row, 0), io.getDouble(row, 1), io.getDouble(row, 2), io.getDouble(row, 3)
+    )
+  }
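A hedged round-trip check of the serializer contract above; to and from should be mutually inverse through either the Row or InternalRow representation (assumes the Row enrichments defined in the CatalystSerializer object are in scope):

    import astraea.spark.rasterframes.encoders.CatalystSerializer._
    import geotrellis.vector.Extent

    val e = Extent(0.0, 0.0, 10.0, 10.0)
    assert(e.toRow.to[Extent] == e)  // serialize, then deserialize, and compare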
+  implicit val crsSerializer: CatalystSerializer[CRS] = new CatalystSerializer[CRS] {
+    override def schema: StructType = StructType(Seq(
+      StructField("crsProj4", StringType, false)
+    ))
+    override def to[R](t: CRS, io: CatalystIO[R]): R = io.create(
+      io.encode(
+        // Don't do this... it's 1000x slower to decode.
+        //t.epsgCode.map(c => "EPSG:" + c).getOrElse(t.toProj4String)
+        t.toProj4String
+      )
+    )
+    override def from[R](row: R, io: CatalystIO[R]): CRS =
+      CRSParser(io.getString(row, 0))
+  }
+
+  implicit val cellTypeSerializer: CatalystSerializer[CellType] = new CatalystSerializer[CellType] {
+    override def schema: StructType = StructType(Seq(
+      StructField("cellTypeName", StringType, false)
+    ))
+    override def to[R](t: CellType, io: CatalystIO[R]): R = io.create(
+      io.encode(t.toString())
+    )
+    override def from[R](row: R, io: CatalystIO[R]): CellType =
+      CellType.fromName(io.getString(row, 0))
+  }
+
+  implicit val projectedExtentSerializer: CatalystSerializer[ProjectedExtent] = new CatalystSerializer[ProjectedExtent] {
+    override def schema: StructType = StructType(Seq(
+      StructField("extent", CatalystSerializer[Extent].schema, false),
+      StructField("crs", CatalystSerializer[CRS].schema, false)
+    ))
+
+    override protected def to[R](t: ProjectedExtent, io: CatalystSerializer.CatalystIO[R]): R = io.create(
+      io.to(t.extent),
+      io.to(t.crs)
+    )
+
+    override protected def from[R](t: R, io: CatalystSerializer.CatalystIO[R]): ProjectedExtent = ProjectedExtent(
+      io.get[Extent](t, 0),
+      io.get[CRS](t, 1)
+    )
+  }
+
+  implicit val spatialKeySerializer: CatalystSerializer[SpatialKey] = new CatalystSerializer[SpatialKey] {
+    override def schema: StructType = StructType(Seq(
+      StructField("col", IntegerType, false),
+      StructField("row", IntegerType, false)
+    ))
+
+    override protected def to[R](t: SpatialKey, io: CatalystIO[R]): R = io.create(
+      t.col,
+      t.row
+    )
+
+    override protected def from[R](t: R, io: CatalystIO[R]): SpatialKey = SpatialKey(
+      io.getInt(t, 0),
+      io.getInt(t, 1)
+    )
+  }
+
+  implicit val spacetimeKeySerializer: CatalystSerializer[SpaceTimeKey] = new CatalystSerializer[SpaceTimeKey] {
+    override def schema: StructType = StructType(Seq(
+      StructField("col", IntegerType, false),
+      StructField("row", IntegerType, false),
+      StructField("instant", LongType, false)
+    ))
+
+    override protected def to[R](t: SpaceTimeKey, io: CatalystIO[R]): R = io.create(
+      t.col,
+      t.row,
+      t.instant
+    )
+
+    override protected def from[R](t: R, io: CatalystIO[R]): SpaceTimeKey = SpaceTimeKey(
+      io.getInt(t, 0),
+      io.getInt(t, 1),
+      io.getLong(t, 2)
+    )
+  }
+
+  implicit val cellSizeSerializer: CatalystSerializer[CellSize] = new CatalystSerializer[CellSize] {
+    override def schema: StructType = StructType(Seq(
+      StructField("width", DoubleType, false),
+      StructField("height", DoubleType, false)
+    ))
+
+    override protected def to[R](t: CellSize, io: CatalystIO[R]): R = io.create(
+      t.width,
+      t.height
+    )
+
+    override protected def from[R](t: R, io: CatalystIO[R]): CellSize = CellSize(
+      io.getDouble(t, 0),
+      io.getDouble(t, 1)
+    )
+  }
+
+  implicit val tileLayoutSerializer: CatalystSerializer[TileLayout] = new CatalystSerializer[TileLayout] {
+    override def schema: StructType = StructType(Seq(
+      StructField("layoutCols", IntegerType, false),
+      StructField("layoutRows", IntegerType, false),
+      StructField("tileCols", IntegerType, false),
+      StructField("tileRows", IntegerType, false)
+    ))
+
+    override protected def to[R](t: TileLayout, io: CatalystIO[R]): R = io.create(
+      t.layoutCols,
+      t.layoutRows,
+      t.tileCols,
+      t.tileRows
+    )
+
+    override protected def from[R](t: R, io: CatalystIO[R]): TileLayout = TileLayout(
+      io.getInt(t, 0),
+      io.getInt(t, 1),
+      io.getInt(t, 2),
+      io.getInt(t, 3)
+    )
+  }
+
+  implicit val layoutDefinitionSerializer = new CatalystSerializer[LayoutDefinition] {
+    override def schema: StructType = StructType(Seq(
+      StructField("extent", CatalystSerializer[Extent].schema, true),
+      StructField("tileLayout", CatalystSerializer[TileLayout].schema, true)
+    ))
+
+    override protected def to[R](t: LayoutDefinition, io: CatalystIO[R]): R = io.create(
+      io.to(t.extent),
+      io.to(t.tileLayout)
+    )
+
+    override protected def from[R](t: R, io: CatalystIO[R]): LayoutDefinition = LayoutDefinition(
+      io.get[Extent](t, 0),
+      io.get[TileLayout](t, 1)
+    )
+  }
+
+  implicit def boundsSerializer[T: CatalystSerializer]: CatalystSerializer[KeyBounds[T]] = new CatalystSerializer[KeyBounds[T]] {
+    override def schema: StructType = StructType(Seq(
+      StructField("minKey", CatalystSerializer[T].schema, true),
+      StructField("maxKey", CatalystSerializer[T].schema, true)
+    ))
+
+    override protected def to[R](t: KeyBounds[T], io: CatalystIO[R]): R = io.create(
+      io.to(t.get.minKey),
+      io.to(t.get.maxKey)
+    )
+
+    override protected def from[R](t: R, io: CatalystIO[R]): KeyBounds[T] = KeyBounds(
+      io.get[T](t, 0),
+      io.get[T](t, 1)
+    )
+  }
+
+  def tileLayerMetadataSerializer[T: CatalystSerializer]: CatalystSerializer[TileLayerMetadata[T]] = new CatalystSerializer[TileLayerMetadata[T]] {
+    override def schema: StructType = StructType(Seq(
+      StructField("cellType", CatalystSerializer[CellType].schema, false),
+      StructField("layout", CatalystSerializer[LayoutDefinition].schema, false),
+      StructField("extent", CatalystSerializer[Extent].schema, false),
+      StructField("crs", CatalystSerializer[CRS].schema, false),
+      StructField("bounds", CatalystSerializer[KeyBounds[T]].schema, false)
+    ))
+
+    override protected def to[R](t: TileLayerMetadata[T], io: CatalystIO[R]): R = io.create(
+      io.to(t.cellType),
+      io.to(t.layout),
+      io.to(t.extent),
+      io.to(t.crs),
+      io.to(t.bounds.head)
+    )
+
+    override protected def from[R](t: R, io: CatalystIO[R]): TileLayerMetadata[T] = TileLayerMetadata(
+      io.get[CellType](t, 0),
+      io.get[LayoutDefinition](t, 1),
+      io.get[Extent](t, 2),
+      io.get[CRS](t, 3),
+      io.get[KeyBounds[T]](t, 4)
+    )
+  }
+
+  implicit val spatialKeyTLMSerializer = tileLayerMetadataSerializer[SpatialKey]
+  implicit val spaceTimeKeyTLMSerializer = tileLayerMetadataSerializer[SpaceTimeKey]
+
+}
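Not part of the diff: a hedged sketch of extending this collection with a serializer for a new type, following the same schema/to/from pattern; the `Dimensions` type and `MySerializers` object here are hypothetical:

    object MySerializers {
      import astraea.spark.rasterframes.encoders.CatalystSerializer
      import astraea.spark.rasterframes.encoders.CatalystSerializer.CatalystIO
      import org.apache.spark.sql.types._

      case class Dimensions(cols: Int, rows: Int)

      implicit val dimensionsSerializer: CatalystSerializer[Dimensions] =
        new CatalystSerializer[Dimensions] {
          override def schema: StructType = StructType(Seq(
            StructField("cols", IntegerType, false),
            StructField("rows", IntegerType, false)))
          override protected def to[R](t: Dimensions, io: CatalystIO[R]): R =
            io.create(t.cols, t.rows)                                // field order matches schema
          override protected def from[R](t: R, io: CatalystIO[R]): Dimensions =
            Dimensions(io.getInt(t, 0), io.getInt(t, 1))             // read back in the same order
        }
    }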
diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/BinaryLocalRasterOp.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/BinaryLocalRasterOp.scala
new file mode 100644
index 000000000..3fac44c65
--- /dev/null
+++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/BinaryLocalRasterOp.scala
@@ -0,0 +1,78 @@
+/*
+ * This software is licensed under the Apache 2 license, quoted below.
+ *
+ * Copyright 2019 Astraea, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * [http://www.apache.org/licenses/LICENSE-2.0]
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ */
+
+package astraea.spark.rasterframes.expressions
+
+import astraea.spark.rasterframes.encoders.CatalystSerializer._
+import astraea.spark.rasterframes.expressions.DynamicExtractors._
+import com.typesafe.scalalogging.LazyLogging
+import geotrellis.raster.Tile
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess}
+import org.apache.spark.sql.catalyst.expressions.BinaryExpression
+import org.apache.spark.sql.rf.TileUDT
+import org.apache.spark.sql.types.DataType
+
+/** Operation combining two tiles, or a tile and a scalar, into a new tile. */
+trait BinaryLocalRasterOp extends BinaryExpression with LazyLogging {
+
+  override def dataType: DataType = left.dataType
+
+  override def checkInputDataTypes(): TypeCheckResult = {
+    if (!tileExtractor.isDefinedAt(left.dataType)) {
+      TypeCheckFailure(s"Input type '${left.dataType}' does not conform to a raster type.")
+    }
+    else if (!tileOrNumberExtractor.isDefinedAt(right.dataType)) {
+      TypeCheckFailure(s"Input type '${right.dataType}' does not conform to a compatible type.")
+    }
+    else TypeCheckSuccess
+  }
+
+  override protected def nullSafeEval(input1: Any, input2: Any): Any = {
+    implicit val tileSer = TileUDT.tileSerializer
+    val (leftTile, leftCtx) = tileExtractor(left.dataType)(row(input1))
+    val result = tileOrNumberExtractor(right.dataType)(input2) match {
+      case TileArg(rightTile, rightCtx) =>
+        if (leftCtx.isEmpty && rightCtx.isDefined)
+          logger.warn(
+            s"Right-hand parameter '${right}' provided an extent and CRS, but the left-hand parameter " +
+              s"'${left}' didn't have any. Because the left-hand side defines output type, the right-hand context will be lost.")
+
+        if (leftCtx.isDefined && rightCtx.isDefined && leftCtx != rightCtx)
+          logger.warn(s"Both '${left}' and '${right}' provided an extent and CRS, but they are different. Left-hand side will be used.")
+
+        op(leftTile, rightTile)
+      case DoubleArg(d) => op(fpTile(leftTile), d)
+      case IntegerArg(i) => op(leftTile, i)
+    }
+
+    leftCtx match {
+      case Some(ctx) => ctx.toProjectRasterTile(result).toInternalRow
+      case None => result.toInternalRow
+    }
+  }
+
+  protected def op(left: Tile, right: Tile): Tile
+  protected def op(left: Tile, right: Double): Tile
+  protected def op(left: Tile, right: Int): Tile
+}
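Not part of the diff: a hedged sketch of what a concrete operator built on this trait could look like; the real Add, Subtract, and similar operators live elsewhere in this changeset, so the class name and GeoTrellis local-op wiring here are illustrative assumptions:

    import geotrellis.raster.Tile
    import geotrellis.raster.mapalgebra.local.{Add => AddOp}
    import org.apache.spark.sql.catalyst.expressions.Expression
    import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback

    // Delegates each op overload to the matching GeoTrellis local Add;
    // CodegenFallback supplies doGenCode, since the trait does not.
    case class MyLocalAdd(left: Expression, right: Expression)
      extends BinaryLocalRasterOp with CodegenFallback {
      override def nodeName: String = "my_local_add"
      override protected def op(left: Tile, right: Tile): Tile = AddOp(left, right)
      override protected def op(left: Tile, right: Double): Tile = AddOp(left, right)
      override protected def op(left: Tile, right: Int): Tile = AddOp(left, right)
    }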
diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/BinaryRasterOp.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/BinaryRasterOp.scala
new file mode 100644
index 000000000..02f8fc29e
--- /dev/null
+++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/BinaryRasterOp.scala
@@ -0,0 +1,70 @@
+/*
+ * This software is licensed under the Apache 2 license, quoted below.
+ *
+ * Copyright 2019 Astraea, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * [http://www.apache.org/licenses/LICENSE-2.0]
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ */
+
+package astraea.spark.rasterframes.expressions
+import astraea.spark.rasterframes.expressions.DynamicExtractors.tileExtractor
+import astraea.spark.rasterframes.encoders.CatalystSerializer._
+import com.typesafe.scalalogging.LazyLogging
+import geotrellis.raster.Tile
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess}
+import org.apache.spark.sql.catalyst.expressions.BinaryExpression
+import org.apache.spark.sql.rf.TileUDT
+import org.apache.spark.sql.types.DataType
+
+/** Operation combining two tiles into a new tile. */
+trait BinaryRasterOp extends BinaryExpression with LazyLogging {
+
+  override def dataType: DataType = left.dataType
+
+  override def checkInputDataTypes(): TypeCheckResult = {
+    if (!tileExtractor.isDefinedAt(left.dataType)) {
+      TypeCheckFailure(s"Input type '${left.dataType}' does not conform to a raster type.")
+    }
+    else if (!tileExtractor.isDefinedAt(right.dataType)) {
+      TypeCheckFailure(s"Input type '${right.dataType}' does not conform to a raster type.")
+    }
+    else TypeCheckSuccess
+  }
+
+  protected def op(left: Tile, right: Tile): Tile
+
+  override protected def nullSafeEval(input1: Any, input2: Any): Any = {
+    implicit val tileSer = TileUDT.tileSerializer
+    val (leftTile, leftCtx) = tileExtractor(left.dataType)(row(input1))
+    val (rightTile, rightCtx) = tileExtractor(right.dataType)(row(input2))
+
+    if (leftCtx.isEmpty && rightCtx.isDefined)
+      logger.warn(
+        s"Right-hand parameter '${right}' provided an extent and CRS, but the left-hand parameter " +
+          s"'${left}' didn't have any. Because the left-hand side defines output type, the right-hand context will be lost.")
+
+    if (leftCtx.isDefined && rightCtx.isDefined && leftCtx != rightCtx)
+      logger.warn(s"Both '${left}' and '${right}' provided an extent and CRS, but they are different. Left-hand side will be used.")
+
+    val result = op(leftTile, rightTile)
+
+    leftCtx match {
+      case Some(ctx) => ctx.toProjectRasterTile(result).toInternalRow
+      case None => result.toInternalRow
+    }
+  }
+}
diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/DynamicExtractors.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/DynamicExtractors.scala
new file mode 100644
index 000000000..1dabc8201
--- /dev/null
+++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/DynamicExtractors.scala
@@ -0,0 +1,114 @@
+/*
+ * This software is licensed under the Apache 2 license, quoted below.
+ *
+ * Copyright 2019 Astraea, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * [http://www.apache.org/licenses/LICENSE-2.0]
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ */
+
+package astraea.spark.rasterframes.expressions
+import astraea.spark.rasterframes.encoders.CatalystSerializer
+import astraea.spark.rasterframes.encoders.CatalystSerializer._
+import astraea.spark.rasterframes.model.TileContext
+import astraea.spark.rasterframes.ref.{ProjectedRasterLike, RasterRef, RasterSource}
+import astraea.spark.rasterframes.tiles.ProjectedRasterTile
+import geotrellis.raster.{CellGrid, Tile}
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.rf.{TileUDT, _}
+import org.apache.spark.sql.types._
+
+private[expressions]
+object DynamicExtractors {
+  /** Partial function for pulling a tile and its context from an input row. */
+  lazy val tileExtractor: PartialFunction[DataType, InternalRow => (Tile, Option[TileContext])] = {
+    case _: TileUDT =>
+      (row: InternalRow) =>
+        (row.to[Tile](TileUDT.tileSerializer), None)
+    case t if t.conformsTo(CatalystSerializer[ProjectedRasterTile].schema) =>
+      (row: InternalRow) => {
+        val prt = row.to[ProjectedRasterTile]
+        (prt, Some(TileContext(prt)))
+      }
+  }
+
+  lazy val rowTileExtractor: PartialFunction[DataType, Row => (Tile, Option[TileContext])] = {
+    case _: TileUDT =>
+      (row: Row) => (row.to[Tile](TileUDT.tileSerializer), None)
+    case t if t.conformsTo(CatalystSerializer[ProjectedRasterTile].schema) =>
+      (row: Row) => {
+        val prt = row.to[ProjectedRasterTile]
+        (prt, Some(TileContext(prt)))
+      }
+  }
+
+  /** Partial function for pulling a ProjectedRasterLike from an input row. */
+  lazy val projectedRasterLikeExtractor: PartialFunction[DataType, InternalRow ⇒ ProjectedRasterLike] = {
+    case _: RasterSourceUDT ⇒
+      (row: InternalRow) ⇒ row.to[RasterSource](RasterSourceUDT.rasterSourceSerializer)
+    case t if t.conformsTo(CatalystSerializer[ProjectedRasterTile].schema) =>
+      (row: InternalRow) => row.to[ProjectedRasterTile]
+    case t if t.conformsTo(CatalystSerializer[RasterRef].schema) =>
+      (row: InternalRow) ⇒ row.to[RasterRef]
+  }
+
+  /** Partial function for pulling a CellGrid from an input row.
+   */
+  lazy val gridExtractor: PartialFunction[DataType, InternalRow ⇒ CellGrid] = {
+    case _: TileUDT ⇒
+      (row: InternalRow) ⇒ row.to[Tile](TileUDT.tileSerializer)
+    case _: RasterSourceUDT ⇒
+      (row: InternalRow) ⇒ row.to[RasterSource](RasterSourceUDT.rasterSourceSerializer)
+    case t if t.conformsTo(CatalystSerializer[RasterRef].schema) ⇒
+      (row: InternalRow) ⇒ row.to[RasterRef]
+  }
+
+  sealed trait TileOrNumberArg
+  sealed trait NumberArg extends TileOrNumberArg
+  case class TileArg(tile: Tile, ctx: Option[TileContext]) extends TileOrNumberArg
+  case class DoubleArg(value: Double) extends NumberArg
+  case class IntegerArg(value: Int) extends NumberArg
+
+  lazy val tileOrNumberExtractor: PartialFunction[DataType, Any => TileOrNumberArg] =
+    tileArgExtractor.orElse(numberArgExtractor)
+
+  lazy val tileArgExtractor: PartialFunction[DataType, Any => TileArg] = {
+    case t if tileExtractor.isDefinedAt(t) => {
+      case ir: InternalRow =>
+        val (tile, ctx) = tileExtractor(t)(ir)
+        TileArg(tile, ctx)
+    }
+  }
+
+  lazy val numberArgExtractor: PartialFunction[DataType, Any => NumberArg] =
+    doubleArgExtractor.orElse(intArgExtractor)
+
+  lazy val doubleArgExtractor: PartialFunction[DataType, Any => DoubleArg] = {
+    case _: DoubleType | _: FloatType | _: DecimalType => {
+      case d: Double => DoubleArg(d)
+      case f: Float => DoubleArg(f.toDouble)
+      case d: Decimal => DoubleArg(d.toDouble)
+    }
+  }
+
+  lazy val intArgExtractor: PartialFunction[DataType, Any => IntegerArg] = {
+    case _: IntegerType | _: ByteType | _: ShortType => {
+      case i: Int => IntegerArg(i)
+      case b: Byte => IntegerArg(b)
+      case s: Short => IntegerArg(s.toInt)
+      case c: Char => IntegerArg(c.toInt)
+    }
+  }
+}
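A hedged illustration of how an expression dispatches on an argument via the extractors above (only meaningful inside the same package, since DynamicExtractors is private[expressions]); `dt` is an argument's DataType and `value` its evaluated result:

    import astraea.spark.rasterframes.expressions.DynamicExtractors._
    import org.apache.spark.sql.types.DataType

    def describeArg(dt: DataType, value: Any): String =
      tileOrNumberExtractor(dt)(value) match {
        case TileArg(tile, ctx) => s"tile ${tile.cols}x${tile.rows}, has context: ${ctx.isDefined}"
        case DoubleArg(d)       => s"double scalar $d"
        case IntegerArg(i)      => s"integer scalar $i"
      }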
diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/GeomDeserializerSupport.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/GeomDeserializerSupport.scala
deleted file mode 100644
index ce17b3430..000000000
--- a/core/src/main/scala/astraea/spark/rasterframes/expressions/GeomDeserializerSupport.scala
+++ /dev/null
@@ -1,23 +0,0 @@
-package astraea.spark.rasterframes.expressions
-
-import com.vividsolutions.jts.geom.Geometry
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.Expression
-import org.apache.spark.sql.jts.AbstractGeometryUDT
-
-/**
- * Support for deserializing JTS geometry inside expressions.
- *
- * @since 2/22/18
- */
-trait GeomDeserializerSupport {
-  def extractGeometry(expr: Expression, input: Any): Geometry = {
-    input match {
-      case g: Geometry ⇒ g
-      case r: InternalRow ⇒
-        expr.dataType match {
-          case udt: AbstractGeometryUDT[_] ⇒ udt.deserialize(r)
-        }
-    }
-  }
-}
diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/GetEnvelope.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/GetEnvelope.scala
deleted file mode 100644
index bdecb45cd..000000000
--- a/core/src/main/scala/astraea/spark/rasterframes/expressions/GetEnvelope.scala
+++ /dev/null
@@ -1,36 +0,0 @@
-package astraea.spark.rasterframes.expressions
-import astraea.spark.rasterframes.encoders.EnvelopeEncoder
-import com.vividsolutions.jts.geom.Envelope
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
-import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression}
-import org.apache.spark.sql.rf._
-import org.apache.spark.sql.types._
-import org.apache.spark.sql.{Column, TypedColumn}
-
-
-/**
- * Extracts the bounding box (envelope) of arbitrary JTS Geometry.
- *
- * @since 2/22/18
- */
-@deprecated("Replace usages of this with GeometryToBounds", "11/4/2018")
-case class GetEnvelope(child: Expression) extends UnaryExpression
-  with CodegenFallback with GeomDeserializerSupport {
-
-  override def nodeName: String = "envelope"
-
-  override protected def nullSafeEval(input: Any): Any = {
-    val geom = extractGeometry(child, input)
-    val env = geom.getEnvelopeInternal
-    InternalRow(env.getMinX, env.getMaxX, env.getMinY, env.getMaxY)
-  }
-
-  def dataType: DataType = EnvelopeEncoder.schema
-}
-
-object GetEnvelope {
-  import astraea.spark.rasterframes.encoders.StandardEncoders._
-  def apply(col: Column): TypedColumn[Any, Envelope] =
-    new GetEnvelope(col.expr).asColumn.as[Envelope]
-}
diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/NullToValue.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/NullToValue.scala
new file mode 100644
index 000000000..edc52fcf7
--- /dev/null
+++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/NullToValue.scala
@@ -0,0 +1,41 @@
+/*
+ * This software is licensed under the Apache 2 license, quoted below.
+ *
+ * Copyright 2019 Astraea, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * [http://www.apache.org/licenses/LICENSE-2.0]
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ */
+
+package astraea.spark.rasterframes.expressions
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.UnaryExpression
+
+trait NullToValue { self: UnaryExpression =>
+
+  def na: Any
+
+  override def eval(input: InternalRow): Any = {
+    if (input == null) na
+    else {
+      val value = child.eval(input)
+      if (value == null) {
+        na
+      } else {
+        nullSafeEval(value)
+      }
+    }
+  }
+}
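A hedged, self-contained illustration of the NullToValue contract above: when the child expression evaluates to null, `na` is returned instead of null. The expression below is invented for demonstration only:

    import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression}
    import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
    import org.apache.spark.sql.types.{DataType, IntegerType}
    import org.apache.spark.unsafe.types.UTF8String

    case class StrLenOrZero(child: Expression) extends UnaryExpression
      with NullToValue with CodegenFallback {
      override def na: Any = 0                      // default returned for null input
      override def dataType: DataType = IntegerType
      override protected def nullSafeEval(input: Any): Any =
        input.asInstanceOf[UTF8String].numChars()   // only reached for non-null input
    }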
- - private val toGrid: PartialFunction[DataType, InternalRow ⇒ CellGrid] = { - case _: TileUDT ⇒ - (row: InternalRow) ⇒ row.to[Tile] - case _: RasterSourceUDT ⇒ - (row: InternalRow) ⇒ row.to[RasterSource] - case t if t.conformsTo(CatalystSerializer[RasterRef].schema) ⇒ - (row: InternalRow) ⇒ row.to[RasterRef] - } - override def checkInputDataTypes(): TypeCheckResult = { - if (!toGrid.isDefinedAt(child.dataType)) { + if (!gridExtractor.isDefinedAt(child.dataType)) { TypeCheckFailure(s"Input type '${child.dataType}' does not conform to `Grid`.") } else TypeCheckSuccess @@ -60,7 +45,7 @@ trait OnCellGridExpression extends UnaryExpression { final override protected def nullSafeEval(input: Any): Any = { input match { case row: InternalRow ⇒ - val g = toGrid(child.dataType)(row) + val g = gridExtractor(child.dataType)(row) eval(g) case o ⇒ throw new IllegalArgumentException(s"Unsupported input type: $o") } diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/OnProjectedRasterExpression.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/OnTileContextExpression.scala similarity index 58% rename from core/src/main/scala/astraea/spark/rasterframes/expressions/OnProjectedRasterExpression.scala rename to core/src/main/scala/astraea/spark/rasterframes/expressions/OnTileContextExpression.scala index f857ff852..a8797ae49 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/expressions/OnProjectedRasterExpression.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/OnTileContextExpression.scala @@ -21,17 +21,12 @@ package astraea.spark.rasterframes.expressions -import astraea.spark.rasterframes.encoders.CatalystSerializer -import astraea.spark.rasterframes.encoders.CatalystSerializer._ -import astraea.spark.rasterframes.ref.{ProjectedRasterLike, RasterRef, RasterSource} -import astraea.spark.rasterframes.tiles.ProjectedRasterTile -import geotrellis.raster.Tile +import astraea.spark.rasterframes.expressions.DynamicExtractors._ +import astraea.spark.rasterframes.model.TileContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess} import org.apache.spark.sql.catalyst.expressions.UnaryExpression -import org.apache.spark.sql.rf._ -import org.apache.spark.sql.types.DataType /** * Implements boilerplate for subtype expressions processing TileUDT (when ProjectedRasterTile), RasterSourceUDT, and @@ -39,26 +34,10 @@ import org.apache.spark.sql.types.DataType * * @since 11/3/18 */ -trait OnProjectedRasterExpression extends UnaryExpression { - - private val toPRL: PartialFunction[DataType, InternalRow ⇒ ProjectedRasterLike] = { - case _: TileUDT ⇒ - (row: InternalRow) ⇒ { - val tile = row.to[Tile] - tile match { - case pr: ProjectedRasterTile ⇒ pr - // TODO: don't let match error happen. Refactor this sub case up a level. - // Not sure how to do do it since we're returning functions that are evaluated later. 
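Stepping back to `OnCellGridExpression` above: with the type dispatch consolidated into `DynamicExtractors.gridExtractor`, a subtype only implements `eval` over the resolved `CellGrid`. A hedged sketch follows (hypothetical accessor; assumes the trait's abstract member is `eval(grid: CellGrid): Any`, consistent with the call site shown):

import geotrellis.raster.CellGrid
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.types.{DataType, IntegerType}

// Hypothetical: row count of any grid-conforming column (Tile, RasterSource, RasterRef).
case class GetRows(child: Expression) extends OnCellGridExpression with CodegenFallback {
  def dataType: DataType = IntegerType
  override def nodeName: String = "grid_rows"
  def eval(grid: CellGrid): Any = grid.rows  // checkInputDataTypes already guaranteed a grid type
}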
- } - } - case _: RasterSourceUDT ⇒ - (row: InternalRow) ⇒ row.to[RasterSource] - case t if t.conformsTo(CatalystSerializer[RasterRef].schema) ⇒ - (row: InternalRow) ⇒ row.to[RasterRef] - } +trait OnTileContextExpression extends UnaryExpression { override def checkInputDataTypes(): TypeCheckResult = { - if (!toPRL.isDefinedAt(child.dataType)) { + if (!projectedRasterLikeExtractor.isDefinedAt(child.dataType)) { TypeCheckFailure(s"Input type '${child.dataType}' does not conform to `ProjectedRasterLike`.") } else TypeCheckSuccess @@ -67,13 +46,12 @@ trait OnProjectedRasterExpression extends UnaryExpression { final override protected def nullSafeEval(input: Any): Any = { input match { case row: InternalRow ⇒ - val prl = toPRL(child.dataType)(row) - eval(prl) + val prl = projectedRasterLikeExtractor(child.dataType)(row) + eval(TileContext(prl.extent, prl.crs)) case o ⇒ throw new IllegalArgumentException(s"Unsupported input type: $o") } } /** Implemented by subtypes to process incoming ProjectedRasterLike entity. */ - def eval(prl: ProjectedRasterLike): Any - + def eval(ctx: TileContext): Any } diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/RequiresTile.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/RequiresTile.scala deleted file mode 100644 index ee11744d0..000000000 --- a/core/src/main/scala/astraea/spark/rasterframes/expressions/RequiresTile.scala +++ /dev/null @@ -1,42 +0,0 @@ -/* - * This software is licensed under the Apache 2 license, quoted below. - * - * Copyright 2017 Astraea, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * [http://www.apache.org/licenses/LICENSE-2.0] - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - * - */ - -package astraea.spark.rasterframes.expressions - -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult -import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess} -import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression} -import org.apache.spark.sql.rf.TileUDT - -/** - * Mixin for indicating an expression requires a Tile for input. 
- * - * @since 12/28/17 - */ -trait RequiresTile { self: UnaryExpression ⇒ - abstract override def checkInputDataTypes(): TypeCheckResult = RequiresTile.check(child) -} - -object RequiresTile { - def check(expr: Expression): TypeCheckResult = - if(expr.dataType.isInstanceOf[TileUDT]) TypeCheckSuccess - else TypeCheckFailure( - s"Expected 'TileUDT' but received '${expr.dataType.simpleString}'" - ) -} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/SpatialRelation.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/SpatialRelation.scala index dde85defc..e994c8a64 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/expressions/SpatialRelation.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/SpatialRelation.scala @@ -21,21 +21,32 @@ package astraea.spark.rasterframes.expressions import astraea.spark.rasterframes.expressions.SpatialRelation.RelationPredicate import com.vividsolutions.jts.geom._ +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.expressions.{ScalaUDF, _} +import org.apache.spark.sql.jts.AbstractGeometryUDT import org.apache.spark.sql.types._ import org.locationtech.geomesa.spark.jts.udf.SpatialRelationFunctions._ - - /** * Determine if two spatial constructs intersect each other. * * @since 12/28/17 */ abstract class SpatialRelation extends BinaryExpression - with CodegenFallback with GeomDeserializerSupport { + with CodegenFallback { + + def extractGeometry(expr: Expression, input: Any): Geometry = { + input match { + case g: Geometry ⇒ g + case r: InternalRow ⇒ + expr.dataType match { + case udt: AbstractGeometryUDT[_] ⇒ udt.deserialize(r) + } + } + } + // TODO: replace with serializer. lazy val jtsPointEncoder = ExpressionEncoder[Point]() override def toString: String = s"$nodeName($left, $right)" diff --git a/core/src/main/scala/astraea/spark/rasterframes/functions/TileAssembler.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/TileAssembler.scala similarity index 92% rename from core/src/main/scala/astraea/spark/rasterframes/functions/TileAssembler.scala rename to core/src/main/scala/astraea/spark/rasterframes/expressions/TileAssembler.scala index 6c55982c3..c3a32267f 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/functions/TileAssembler.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/TileAssembler.scala @@ -1,7 +1,7 @@ /* * This software is licensed under the Apache 2 license, quoted below. * - * Copyright 2017 Astraea, Inc. + * Copyright 2019 Astraea, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -15,21 +15,20 @@ * License for the specific language governing permissions and limitations under * the License. 
* + * SPDX-License-Identifier: Apache-2.0 + * */ -package astraea.spark.rasterframes.functions +package astraea.spark.rasterframes.expressions + +import java.nio.ByteBuffer -import java.nio.{ByteBuffer, DoubleBuffer} -import astraea.spark.rasterframes.encoders._ +import astraea.spark.rasterframes.expressions.TileAssembler.TileBuffer import astraea.spark.rasterframes.util._ -import astraea.spark.rasterframes.NOMINAL_TILE_SIZE -import astraea.spark.rasterframes.functions.TileAssembler.TileBuffer import geotrellis.raster.{DataType => _, _} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.aggregate.{ - ImperativeAggregate, TypedImperativeAggregate -} -import org.apache.spark.sql.catalyst.expressions.{Expression, ImplicitCastInputTypes, Literal} +import org.apache.spark.sql.catalyst.expressions.aggregate.{ImperativeAggregate, TypedImperativeAggregate} +import org.apache.spark.sql.catalyst.expressions.{Expression, ImplicitCastInputTypes} import org.apache.spark.sql.rf.TileUDT import org.apache.spark.sql.types._ import org.apache.spark.sql.{Column, TypedColumn} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/UnaryLocalRasterOp.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/UnaryLocalRasterOp.scala new file mode 100644 index 000000000..049e6d9a1 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/UnaryLocalRasterOp.scala @@ -0,0 +1,58 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions + +import astraea.spark.rasterframes.encoders.CatalystSerializer._ +import astraea.spark.rasterframes.expressions.DynamicExtractors._ +import com.typesafe.scalalogging.LazyLogging +import geotrellis.raster.Tile +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess} +import org.apache.spark.sql.catalyst.expressions.UnaryExpression +import org.apache.spark.sql.rf.TileUDT +import org.apache.spark.sql.types.DataType + +/** Operation on a tile returning a tile. 
*/ +trait UnaryLocalRasterOp extends UnaryExpression with LazyLogging { + + override def dataType: DataType = child.dataType + + override def checkInputDataTypes(): TypeCheckResult = { + if (!tileExtractor.isDefinedAt(child.dataType)) { + TypeCheckFailure(s"Input type '${child.dataType}' does not conform to a raster type.") + } + else TypeCheckSuccess + } + + override protected def nullSafeEval(input: Any): Any = { + implicit val tileSer = TileUDT.tileSerializer + val (childTile, childCtx) = tileExtractor(child.dataType)(row(input)) + + childCtx match { + case Some(ctx) => ctx.toProjectRasterTile(op(childTile)).toInternalRow + case None => op(childTile).toInternalRow + } + } + + protected def op(child: Tile): Tile +} + diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/UnaryRasterAggregate.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/UnaryRasterAggregate.scala new file mode 100644 index 000000000..a28ae6753 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/UnaryRasterAggregate.scala @@ -0,0 +1,45 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions +import astraea.spark.rasterframes.expressions.DynamicExtractors.rowTileExtractor +import geotrellis.raster.Tile +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUDF} +import org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate +import scala.reflect.runtime.universe._ + +/** Mixin providing boilerplate for DeclarativeAggregates over tile-conforming columns. */ +trait UnaryRasterAggregate extends DeclarativeAggregate { + def child: Expression + + def nullable: Boolean = child.nullable + + def children = Seq(child) + + protected def tileOpAsExpression[R: TypeTag](name: String, op: Tile => R): Expression => ScalaUDF = + udfexpr[R, Any](name, (a: Any) => op(extractTileFromAny(a))) + + protected val extractTileFromAny = (a: Any) => a match { + case t: Tile => t + case r: Row => rowTileExtractor(child.dataType)(r)._1 + } +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/UnaryRasterOp.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/UnaryRasterOp.scala new file mode 100644 index 000000000..f21dc4bb5 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/UnaryRasterOp.scala @@ -0,0 +1,46 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License.
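`UnaryLocalRasterOp` above re-attaches the original `TileContext` (when present) to the operation's result, so projection metadata survives cell-wise transforms. A sketch of a hypothetical operation built on it (illustrative only; `tile.map` over the integer view keeps the example simple):

import geotrellis.raster.Tile
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback

// Hypothetical: cell-wise absolute value, preserving any tile context.
case class AbsCells(child: Expression) extends UnaryLocalRasterOp with CodegenFallback {
  override def nodeName: String = "abs_cells"
  override protected def op(tile: Tile): Tile = tile.map(c => math.abs(c))
}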
You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions +import astraea.spark.rasterframes.expressions.DynamicExtractors._ +import astraea.spark.rasterframes.model.TileContext +import geotrellis.raster.Tile +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess} +import org.apache.spark.sql.catalyst.expressions.UnaryExpression + +/** Boilerplate for expressions operating on a single Tile-like value. */ +trait UnaryRasterOp extends UnaryExpression { + override def checkInputDataTypes(): TypeCheckResult = { + if (!tileExtractor.isDefinedAt(child.dataType)) { + TypeCheckFailure(s"Input type '${child.dataType}' does not conform to a raster type.") + } else TypeCheckSuccess + } + + override protected def nullSafeEval(input: Any): Any = { + // TODO: Ensure InternalRowTile is preserved + val (tile, ctx) = tileExtractor(child.dataType)(row(input)) + eval(tile, ctx) + } + + protected def eval(tile: Tile, ctx: Option[TileContext]): Any +} + diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/ExtractTile.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/ExtractTile.scala new file mode 100644 index 000000000..7cb7ba3b1 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/ExtractTile.scala @@ -0,0 +1,53 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License.
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.accessors + +import astraea.spark.rasterframes.encoders.CatalystSerializer._ +import astraea.spark.rasterframes.expressions.UnaryRasterOp +import astraea.spark.rasterframes.model.TileContext +import astraea.spark.rasterframes.tiles.InternalRowTile +import astraea.spark.rasterframes.tiles.ProjectedRasterTile.ConcreteProjectedRasterTile +import geotrellis.raster.Tile +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.rf.TileUDT +import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.{Column, TypedColumn} + +/** Expression to extract a tile from several types that contain tiles. */ +case class ExtractTile(child: Expression) extends UnaryRasterOp with CodegenFallback { + override def dataType: DataType = new TileUDT() + + override def nodeName: String = "extract_tile" + implicit val tileSer = TileUDT.tileSerializer + override protected def eval(tile: Tile, ctx: Option[TileContext]): Any = tile match { + case irt: InternalRowTile => irt.mem + case tile: ConcreteProjectedRasterTile => tile.t.toInternalRow + case tile: Tile => tile.toInternalRow + } +} + +object ExtractTile { + import astraea.spark.rasterframes.encoders.StandardEncoders.singlebandTileEncoder + def apply(input: Column): TypedColumn[Any, Tile] = + new Column(new ExtractTile(input.expr)).as[Tile] +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/GetCRS.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/GetCRS.scala similarity index 80% rename from core/src/main/scala/astraea/spark/rasterframes/expressions/GetCRS.scala rename to core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/GetCRS.scala index 20974b891..1a6d29df0 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/expressions/GetCRS.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/GetCRS.scala @@ -1,7 +1,7 @@ /* * This software is licensed under the Apache 2 license, quoted below. * - * Copyright 2018 Astraea, Inc. + * Copyright 2019 Astraea, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License.
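`ExtractTile` normalizes the three tile representations it may encounter (an `InternalRowTile`'s backing row, a `ConcreteProjectedRasterTile`, or a plain `Tile`) down to a bare tile. From the DataFrame API it is just a typed column; a usage sketch, assuming a DataFrame `df` with a tile-conforming column named "tile":

import astraea.spark.rasterframes.expressions.accessors.ExtractTile
import org.apache.spark.sql.functions.col

val tiles = df.select(ExtractTile(col("tile")))  // TypedColumn[Any, Tile], projection context stripped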
You may obtain a copy of @@ -19,17 +19,17 @@ * */ -package astraea.spark.rasterframes.expressions +package astraea.spark.rasterframes.expressions.accessors import astraea.spark.rasterframes.encoders.CatalystSerializer import astraea.spark.rasterframes.encoders.CatalystSerializer._ import astraea.spark.rasterframes.encoders.StandardEncoders.crsEncoder -import astraea.spark.rasterframes.ref.ProjectedRasterLike +import astraea.spark.rasterframes.expressions.OnTileContextExpression +import astraea.spark.rasterframes.model.TileContext import geotrellis.proj4.CRS import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback -import org.apache.spark.sql.rf._ import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{Column, TypedColumn} @@ -38,10 +38,10 @@ import org.apache.spark.sql.{Column, TypedColumn} * * @since 9/9/18 */ -case class GetCRS(child: Expression) extends OnProjectedRasterExpression with CodegenFallback { +case class GetCRS(child: Expression) extends OnTileContextExpression with CodegenFallback { override def dataType: DataType = CatalystSerializer[CRS].schema override def nodeName: String = "crs" - override def eval(prl: ProjectedRasterLike): InternalRow = prl.crs.toInternalRow + override def eval(ctx: TileContext): InternalRow = ctx.crs.toInternalRow } object GetCRS { diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/GetCellType.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/GetCellType.scala similarity index 89% rename from core/src/main/scala/astraea/spark/rasterframes/expressions/GetCellType.scala rename to core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/GetCellType.scala index 34b723631..eeb521e4b 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/expressions/GetCellType.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/GetCellType.scala @@ -1,7 +1,7 @@ /* * This software is licensed under the Apache 2 license, quoted below. * - * Copyright 2017 Astraea, Inc. + * Copyright 2019 Astraea, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -15,12 +15,15 @@ * License for the specific language governing permissions and limitations under * the License. 
* + * SPDX-License-Identifier: Apache-2.0 + * */ -package astraea.spark.rasterframes.expressions +package astraea.spark.rasterframes.expressions.accessors import astraea.spark.rasterframes.encoders.CatalystSerializer import astraea.spark.rasterframes.encoders.CatalystSerializer._ +import astraea.spark.rasterframes.expressions.OnCellGridExpression import geotrellis.raster.{CellGrid, CellType} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/GetDimensions.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/GetDimensions.scala similarity index 85% rename from core/src/main/scala/astraea/spark/rasterframes/expressions/GetDimensions.scala rename to core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/GetDimensions.scala index caba2de23..3589dbc1b 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/expressions/GetDimensions.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/GetDimensions.scala @@ -1,7 +1,7 @@ /* * This software is licensed under the Apache 2 license, quoted below. * - * Copyright 2017 Astraea, Inc. + * Copyright 2019 Astraea, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -15,13 +15,16 @@ * License for the specific language governing permissions and limitations under * the License. * + * SPDX-License-Identifier: Apache-2.0 + * */ -package astraea.spark.rasterframes.expressions +package astraea.spark.rasterframes.expressions.accessors -import astraea.spark.rasterframes.TileDimensions import astraea.spark.rasterframes.encoders.CatalystSerializer import astraea.spark.rasterframes.encoders.CatalystSerializer._ +import astraea.spark.rasterframes.expressions.OnCellGridExpression +import astraea.spark.rasterframes.model.TileDimensions import geotrellis.raster.CellGrid import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.expressions.Expression @@ -41,7 +44,6 @@ case class GetDimensions(child: Expression) extends OnCellGridExpression } object GetDimensions { - import astraea.spark.rasterframes.encoders.SparkDefaultEncoders._ def apply(col: Column): Column = new Column(new GetDimensions(col.expr)).as[TileDimensions] } diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/GetEnvelope.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/GetEnvelope.scala new file mode 100644 index 000000000..551f64eb0 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/GetEnvelope.scala @@ -0,0 +1,66 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.accessors + +import astraea.spark.rasterframes.encoders.EnvelopeEncoder +import com.vividsolutions.jts.geom.{Envelope, Geometry} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression} +import org.apache.spark.sql.jts.AbstractGeometryUDT +import org.apache.spark.sql.rf._ +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{Column, TypedColumn} + +/** + * Extracts the bounding box (envelope) of arbitrary JTS Geometry. + * + * @since 2/22/18 + */ +@deprecated("Replace usages of this with GeometryToBounds", "11/4/2018") +case class GetEnvelope(child: Expression) extends UnaryExpression with CodegenFallback { + + override def nodeName: String = "envelope" + def extractGeometry(expr: Expression, input: Any): Geometry = { + input match { + case g: Geometry => g + case r: InternalRow => + expr.dataType match { + case udt: AbstractGeometryUDT[_] => udt.deserialize(r) + } + } + } + + override protected def nullSafeEval(input: Any): Any = { + val geom = extractGeometry(child, input) + val env = geom.getEnvelopeInternal + InternalRow(env.getMinX, env.getMaxX, env.getMinY, env.getMaxY) + } + + def dataType: DataType = EnvelopeEncoder.schema +} + +object GetEnvelope { + import astraea.spark.rasterframes.encoders.StandardEncoders._ + def apply(col: Column): TypedColumn[Any, Envelope] = + new GetEnvelope(col.expr).asColumn.as[Envelope] +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/GetExtent.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/GetExtent.scala similarity index 80% rename from core/src/main/scala/astraea/spark/rasterframes/expressions/GetExtent.scala rename to core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/GetExtent.scala index 71f47a053..c3e664887 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/expressions/GetExtent.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/GetExtent.scala @@ -1,7 +1,7 @@ /* * This software is licensed under the Apache 2 license, quoted below. * - * Copyright 2018 Astraea, Inc. + * Copyright 2019 Astraea, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -19,12 +19,13 @@ * */ -package astraea.spark.rasterframes.expressions +package astraea.spark.rasterframes.expressions.accessors import astraea.spark.rasterframes.encoders.CatalystSerializer import astraea.spark.rasterframes.encoders.CatalystSerializer._ import astraea.spark.rasterframes.encoders.StandardEncoders.extentEncoder -import astraea.spark.rasterframes.ref.ProjectedRasterLike +import astraea.spark.rasterframes.expressions.OnTileContextExpression +import astraea.spark.rasterframes.model.TileContext import geotrellis.vector.Extent import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -37,10 +38,10 @@ import org.apache.spark.sql.{Column, TypedColumn} * * @since 9/10/18 */ -case class GetExtent(child: Expression) extends OnProjectedRasterExpression with CodegenFallback { +case class GetExtent(child: Expression) extends OnTileContextExpression with CodegenFallback { override def dataType: DataType = CatalystSerializer[Extent].schema override def nodeName: String = "extent" - override def eval(prl: ProjectedRasterLike): InternalRow = prl.extent.toInternalRow + override def eval(ctx: TileContext): InternalRow = ctx.extent.toInternalRow } object GetExtent { diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/GetTileContext.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/GetTileContext.scala new file mode 100644 index 000000000..98b7eb401 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/accessors/GetTileContext.scala @@ -0,0 +1,46 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.accessors +import astraea.spark.rasterframes.encoders.CatalystSerializer +import astraea.spark.rasterframes.encoders.CatalystSerializer._ +import astraea.spark.rasterframes.expressions.UnaryRasterOp +import astraea.spark.rasterframes.model.TileContext +import geotrellis.raster.Tile +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.{Column, TypedColumn} + +case class GetTileContext(child: Expression) extends UnaryRasterOp with CodegenFallback { + override def dataType: DataType = CatalystSerializer[TileContext].schema + + override def nodeName: String = "get_tile_context" + override protected def eval(tile: Tile, ctx: Option[TileContext]): Any = + ctx.map(_.toInternalRow).orNull +} + +object GetTileContext { + import astraea.spark.rasterframes.encoders.StandardEncoders.tileContextEncoder + + def apply(input: Column): TypedColumn[Any, TileContext] = + new Column(new GetTileContext(input.expr)).as[TileContext] +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/CellCountAggregate.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/CellCountAggregate.scala new file mode 100644 index 000000000..0a4424665 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/CellCountAggregate.scala @@ -0,0 +1,106 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.aggstats + +import astraea.spark.rasterframes.expressions.UnaryRasterAggregate +import astraea.spark.rasterframes.expressions.tilestats.{DataCells, NoDataCells} +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, _} +import org.apache.spark.sql.types.{LongType, Metadata} +import org.apache.spark.sql.{Column, TypedColumn} + +/** + * Cell count (data or NoData) aggregate function. + * + * @since 10/5/17 + * @param isData true if count should be of non-NoData cells, false if count should be of NoData cells. 
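`GetTileContext` is the complement of `ExtractTile`: it surfaces the extent/CRS pair when the tile carries one and null otherwise. A usage sketch, under the same assumed `df` and column name as above:

import astraea.spark.rasterframes.expressions.accessors.GetTileContext
import org.apache.spark.sql.functions.col

val contexts = df.select(GetTileContext(col("tile")))  // null for tiles without projection info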
+ */ +abstract class CellCountAggregate(isData: Boolean) extends UnaryRasterAggregate { + private lazy val count = + AttributeReference("count", LongType, false, Metadata.empty)() + + override lazy val aggBufferAttributes = Seq( + count + ) + + val initialValues = Seq( + Literal(0L) + ) + + private def CellTest = + if (isData) tileOpAsExpression("data_cells", DataCells.op) + else tileOpAsExpression("no_data_cells", NoDataCells.op) + + val updateExpressions = Seq( + If(IsNull(child), count, Add(count, CellTest(child))) + ) + + val mergeExpressions = Seq( + count.left + count.right + ) + + val evaluateExpression = count + + def dataType = LongType +} + +object CellCountAggregate { + import astraea.spark.rasterframes.encoders.StandardEncoders.PrimitiveEncoders.longEnc + + @ExpressionDescription( + usage = "_FUNC_(tile) - Count the total data (non-no-data) cells in a tile column.", + arguments = """ + Arguments: + * tile - tile column to analyze""", + examples = """ + Examples: + > SELECT _FUNC_(tile); + 92384753""" + ) + case class DataCells(child: Expression) extends CellCountAggregate(true) { + override def nodeName: String = "agg_data_cells" + } + object DataCells { + def apply(tile: Column): TypedColumn[Any, Long] = + new Column(DataCells(tile.expr).toAggregateExpression()).as[Long] + } + @ExpressionDescription( + usage = "_FUNC_(tile) - Count the total no-data cells in a tile column.", + arguments = """ + Arguments: + * tile - tile column to analyze""", + examples = """ + Examples: + > SELECT _FUNC_(tile); + 23584""" + ) + case class NoDataCells(child: Expression) extends CellCountAggregate(false) { + override def nodeName: String = "agg_no_data_cells" + } + object NoDataCells { + def apply(tile: Column): TypedColumn[Any, Long] = + new Column(NoDataCells(tile.expr).toAggregateExpression()).as[Long] + } +} + + + diff --git a/core/src/main/scala/astraea/spark/rasterframes/functions/CellMeanAggregate.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/CellMeanAggregate.scala similarity index 50% rename from core/src/main/scala/astraea/spark/rasterframes/functions/CellMeanAggregate.scala rename to core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/CellMeanAggregate.scala index f682505a7..846f169cb 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/functions/CellMeanAggregate.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/CellMeanAggregate.scala @@ -1,7 +1,7 @@ /* * This software is licensed under the Apache 2 license, quoted below. * - * Copyright 2017 Astraea, Inc. + * Copyright 2019 Astraea, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -15,27 +15,33 @@ * License for the specific language governing permissions and limitations under * the License. 
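Because the null handling lives in `updateExpressions` (`If(IsNull(child), count, ...)`), a null tile leaves the running count untouched rather than nulling the buffer. A DataFrame-level sketch of both aggregates, same assumed `df`:

import astraea.spark.rasterframes.expressions.aggstats.CellCountAggregate.{DataCells, NoDataCells}
import org.apache.spark.sql.functions.col

// Scalar totals of data vs. NoData cells across the whole tile column.
val counts = df.agg(DataCells(col("tile")), NoDataCells(col("tile")))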
* + * SPDX-License-Identifier: Apache-2.0 + * + */ -package astraea.spark.rasterframes.functions +package astraea.spark.rasterframes.expressions.aggstats -import org.apache.spark.sql.{Column, TypedColumn} -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression} -import org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate -import org.apache.spark.sql.types.{DoubleType, LongType, Metadata} +import astraea.spark.rasterframes.expressions.UnaryRasterAggregate +import astraea.spark.rasterframes.expressions.tilestats.{DataCells, Sum} import org.apache.spark.sql.catalyst.dsl.expressions._ -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.rf.{TileUDT, _} +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, _} +import org.apache.spark.sql.types.{DoubleType, LongType, Metadata} +import org.apache.spark.sql.{Column, TypedColumn} /** * Cell mean aggregate function * * @since 10/5/17 */ -case class CellMeanAggregate(child: Expression) extends DeclarativeAggregate { - - override def prettyName: String = "agg_mean" +@ExpressionDescription( + usage = "_FUNC_(tile) - Computes the mean of all cell values.", + examples = """ + Examples: + > SELECT _FUNC_(tile); + .... + """) +case class CellMeanAggregate(child: Expression) extends UnaryRasterAggregate { + override def nodeName: String = "agg_mean" private lazy val sum = AttributeReference("sum", DoubleType, false, Metadata.empty)() @@ -44,38 +50,36 @@ case class CellMeanAggregate(child: Expression) extends DeclarativeAggregate { override lazy val aggBufferAttributes = Seq(sum, count) - val initialValues = Seq( + override val initialValues = Seq( Literal(0.0), Literal(0L) ) - private val dataCellCounts = udf(dataCells) - private val sumCells = udf(tileSum) - - val updateExpressions = Seq( - If(IsNull(child), sum , Add(sum, sumCells(new Column(child)).expr)), - If(IsNull(child), count, Add(count, dataCellCounts(new Column(child)).expr)) + // Can't figure out why we can't just use the Expression directly + // this is necessary to properly handle null rows. For example, + // if we use `tilestats.Sum` directly, we get an NPE when the stage is executed. + private val DataCellCounts = tileOpAsExpression("data_cells", DataCells.op) + private val SumCells = tileOpAsExpression("sum_cells", Sum.op) + + override val updateExpressions = Seq( + // TODO: Figure out why this doesn't work. See above. + //If(IsNull(child), sum , Add(sum, Sum(child))), + If(IsNull(child), sum , Add(sum, SumCells(child))), + If(IsNull(child), count, Add(count, DataCellCounts(child))) ) - val mergeExpressions = Seq( + override val mergeExpressions = Seq( sum.left + sum.right, count.left + count.right ) - val evaluateExpression = sum / new Cast(count, DoubleType) - - def inputTypes = Seq(TileUDT) - - def nullable = true - - def dataType = DoubleType - - def children = Seq(child) + override val evaluateExpression = sum / new Cast(count, DoubleType) + override def dataType = DoubleType } object CellMeanAggregate { - import astraea.spark.rasterframes.encoders.SparkDefaultEncoders._ + import astraea.spark.rasterframes.encoders.StandardEncoders.PrimitiveEncoders.doubleEnc /** Computes the column aggregate mean.
*/ def apply(tile: Column): TypedColumn[Any, Double] = new Column(new CellMeanAggregate(tile.expr).toAggregateExpression()).as[Double] diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/CellStatsAggregate.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/CellStatsAggregate.scala new file mode 100644 index 000000000..cfcde38a5 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/CellStatsAggregate.scala @@ -0,0 +1,165 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.aggstats + +import astraea.spark.rasterframes.expressions.accessors.ExtractTile +import astraea.spark.rasterframes.stats.CellStatistics +import geotrellis.raster.{Tile, _} +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, AggregateFunction, AggregateMode, Complete} +import org.apache.spark.sql.catalyst.expressions.{ExprId, Expression, ExpressionDescription, NamedExpression} +import org.apache.spark.sql.execution.aggregate.ScalaUDAF +import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} +import org.apache.spark.sql.rf.TileUDT +import org.apache.spark.sql.types.{DataType, _} +import org.apache.spark.sql.{Column, Row, TypedColumn} + +/** + * Statistics aggregation function for a full column of tiles. 
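The mean is assembled declaratively: two buffer slots (`sum`, `count`), per-row updates through the lifted `SumCells`/`DataCellCounts` UDFs, and `sum / count` (cast to double) at evaluation. Usage sketch, same assumed `df`:

import astraea.spark.rasterframes.expressions.aggstats.CellMeanAggregate
import org.apache.spark.sql.functions.col

val mean = df.agg(CellMeanAggregate(col("tile")))  // single Double over every data cell in the column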
* + * @since 4/17/17 + */ +case class CellStatsAggregate() extends UserDefinedAggregateFunction { + import CellStatsAggregate.C + // TODO: rewrite as a DeclarativeAggregate + private val TileType = new TileUDT() + + override def inputSchema: StructType = StructType(StructField("value", TileType) :: Nil) + + override def dataType: DataType = StructType(Seq( + StructField("data_cells", LongType), + StructField("no_data_cells", LongType), + StructField("min", DoubleType), + StructField("max", DoubleType), + StructField("mean", DoubleType), + StructField("variance", DoubleType) + )) + + override def bufferSchema: StructType = StructType(Seq( + StructField("data_cells", LongType), + StructField("no_data_cells", LongType), + StructField("min", DoubleType), + StructField("max", DoubleType), + StructField("sum", DoubleType), + StructField("sumSqr", DoubleType) + )) + + override def deterministic: Boolean = true + + override def initialize(buffer: MutableAggregationBuffer): Unit = { + buffer(C.COUNT) = 0L + buffer(C.NODATA) = 0L + buffer(C.MIN) = Double.MaxValue + buffer(C.MAX) = Double.MinValue + buffer(C.SUM) = 0.0 + buffer(C.SUM_SQRS) = 0.0 + } + + override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { + if (!input.isNullAt(0)) { + val tile = input.getAs[Tile](0) + var count = buffer.getLong(C.COUNT) + var nodata = buffer.getLong(C.NODATA) + var min = buffer.getDouble(C.MIN) + var max = buffer.getDouble(C.MAX) + var sum = buffer.getDouble(C.SUM) + var sumSqr = buffer.getDouble(C.SUM_SQRS) + + tile.foreachDouble( + c => + if (isData(c)) { + count += 1 + min = math.min(min, c) + max = math.max(max, c) + sum = sum + c + sumSqr = sumSqr + c * c + } else nodata += 1) + + buffer(C.COUNT) = count + buffer(C.NODATA) = nodata + buffer(C.MIN) = min + buffer(C.MAX) = max + buffer(C.SUM) = sum + buffer(C.SUM_SQRS) = sumSqr + } + } + + override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { + buffer1(C.COUNT) = buffer1.getLong(C.COUNT) + buffer2.getLong(C.COUNT) + buffer1(C.NODATA) = buffer1.getLong(C.NODATA) + buffer2.getLong(C.NODATA) + buffer1(C.MIN) = math.min(buffer1.getDouble(C.MIN), buffer2.getDouble(C.MIN)) + buffer1(C.MAX) = math.max(buffer1.getDouble(C.MAX), buffer2.getDouble(C.MAX)) + buffer1(C.SUM) = buffer1.getDouble(C.SUM) + buffer2.getDouble(C.SUM) + buffer1(C.SUM_SQRS) = buffer1.getDouble(C.SUM_SQRS) + buffer2.getDouble(C.SUM_SQRS) + } + + override def evaluate(buffer: Row): Any = { + val count = buffer.getLong(C.COUNT) + val sum = buffer.getDouble(C.SUM) + val sumSqr = buffer.getDouble(C.SUM_SQRS) + val mean = sum / count + val variance = sumSqr / count - mean * mean + Row(count, buffer(C.NODATA), buffer(C.MIN), buffer(C.MAX), mean, variance) + } +} + +object CellStatsAggregate { + import astraea.spark.rasterframes.encoders.StandardEncoders.cellStatsEncoder + + def apply(col: Column): TypedColumn[Any, CellStatistics] = + new Column(new CellStatsAggregateUDAF(col.expr)) + .as(s"agg_stats($col)") // node renaming in class doesn't seem to propagate + .as[CellStatistics] + + /** Adapter hack to allow UserDefinedAggregateFunction to be referenced as an expression.
*/ + @ExpressionDescription( + usage = "_FUNC_(tile) - Compute aggregate descriptive cell statistics over a tile column.", + arguments = """ + Arguments: + * tile - tile column to analyze""", + examples = """ + Examples: + > SELECT _FUNC_(tile); + +----------+-------------+---+-----+-------+-----------------+ + |data_cells|no_data_cells|min|max |mean |variance | + +----------+-------------+---+-----+-------+-----------------+ + |960 |40 |1.0|255.0|127.175|5441.704791666667| + +----------+-------------+---+-----+-------+-----------------+""" + ) + class CellStatsAggregateUDAF(aggregateFunction: AggregateFunction, mode: AggregateMode, isDistinct: Boolean, resultId: ExprId) + extends AggregateExpression(aggregateFunction, mode, isDistinct, resultId) { + def this(child: Expression) = this(ScalaUDAF(Seq(ExtractTile(child)), new CellStatsAggregate()), Complete, false, NamedExpression.newExprId) + override def nodeName: String = "agg_stats" + } + object CellStatsAggregateUDAF { + def apply(child: Expression): CellStatsAggregateUDAF = new CellStatsAggregateUDAF(child) + } + + /** Column index values. */ + private object C { + final val COUNT = 0 + final val NODATA = 1 + final val MIN = 2 + final val MAX = 3 + final val SUM = 4 + final val SUM_SQRS = 5 + } +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/HistogramAggregate.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/HistogramAggregate.scala new file mode 100644 index 000000000..7920415da --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/HistogramAggregate.scala @@ -0,0 +1,125 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.aggstats + +import java.nio.ByteBuffer + +import astraea.spark.rasterframes.expressions.accessors.ExtractTile +import astraea.spark.rasterframes.functions.safeEval +import astraea.spark.rasterframes.stats.CellHistogram +import geotrellis.raster.Tile +import geotrellis.raster.histogram.{Histogram, StreamingHistogram} +import geotrellis.spark.util.KryoSerializer +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, AggregateFunction, AggregateMode, Complete} +import org.apache.spark.sql.catalyst.expressions.{ExprId, Expression, ExpressionDescription, NamedExpression} +import org.apache.spark.sql.execution.aggregate.ScalaUDAF +import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} +import org.apache.spark.sql.rf.TileUDT +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{Column, Row, TypedColumn} + +/** + * Histogram aggregation function for a full column of tiles. 
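`evaluate` above uses the one-pass identity Var(X) = E[X²] − E[X]², which is why the buffer tracks `sum` and `sumSqr` rather than centered moments. A quick scalar check of the identity in plain Scala (no Spark required):

val xs = Seq(1.0, 2.0, 3.0, 4.0)
val mean = xs.sum / xs.size                                    // 2.5
val variance = xs.map(x => x * x).sum / xs.size - mean * mean  // 1.25, the population variance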
* + * @since 4/24/17 + */ +case class HistogramAggregate(numBuckets: Int) extends UserDefinedAggregateFunction { + def this() = this(StreamingHistogram.DEFAULT_NUM_BUCKETS) + // TODO: rewrite as TypedAggregateExpression or similar. + private val TileType = new TileUDT() + + override def inputSchema: StructType = StructType(StructField("value", TileType) :: Nil) + + override def bufferSchema: StructType = StructType(StructField("buffer", BinaryType) :: Nil) + + override def dataType: DataType = CellHistogram.schema + + override def deterministic: Boolean = true + + @transient + private lazy val ser = KryoSerializer.ser.newInstance() + + @inline + private def marshall(hist: Histogram[Double]): Array[Byte] = ser.serialize(hist).array() + + @inline + private def unmarshall(blob: Array[Byte]): Histogram[Double] = ser.deserialize(ByteBuffer.wrap(blob)) + + override def initialize(buffer: MutableAggregationBuffer): Unit = + buffer(0) = marshall(StreamingHistogram(numBuckets)) + + private val safeMerge = (h1: Histogram[Double], h2: Histogram[Double]) ⇒ (h1, h2) match { + case (null, null) => null + case (l, null) => l + case (null, r) => r + case (l, r) => l merge r + } + + override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { + val tile = input.getAs[Tile](0) + val hist1 = unmarshall(buffer.getAs[Array[Byte]](0)) + val hist2 = safeEval(StreamingHistogram.fromTile(_: Tile, numBuckets))(tile) + val updatedHist = safeMerge(hist1, hist2) + buffer(0) = marshall(updatedHist) + } + + override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { + val hist1 = unmarshall(buffer1.getAs[Array[Byte]](0)) + val hist2 = unmarshall(buffer2.getAs[Array[Byte]](0)) + val updatedHist = safeMerge(hist1, hist2) + buffer1(0) = marshall(updatedHist) + } + + override def evaluate(buffer: Row): Any = { + val hist = unmarshall(buffer.getAs[Array[Byte]](0)) + CellHistogram(hist) + } +} + +object HistogramAggregate { + import astraea.spark.rasterframes.encoders.StandardEncoders.cellHistEncoder + + def apply(col: Column): TypedColumn[Any, CellHistogram] = + new Column(new HistogramAggregateUDAF(col.expr)) + .as(s"agg_approx_histogram($col)") // node renaming in class doesn't seem to propagate + .as[CellHistogram] + + /** Adapter hack to allow UserDefinedAggregateFunction to be referenced as an expression.
*/ + @ExpressionDescription( + usage = "_FUNC_(tile) - Compute aggregate cell histogram over a tile column.", + arguments = """ + Arguments: + * tile - tile column to analyze""", + examples = """ + Examples: + > SELECT _FUNC_(tile); + ...""" + ) + class HistogramAggregateUDAF(aggregateFunction: AggregateFunction, mode: AggregateMode, isDistinct: Boolean, resultId: ExprId) + extends AggregateExpression(aggregateFunction, mode, isDistinct, resultId) { + def this(child: Expression) = this(ScalaUDAF(Seq(ExtractTile(child)), new HistogramAggregate()), Complete, false, NamedExpression.newExprId) + override def nodeName: String = "agg_approx_histogram" + } + object HistogramAggregateUDAF { + def apply(child: Expression): HistogramAggregateUDAF = new HistogramAggregateUDAF(child) + } +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/LocalCountAggregate.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/LocalCountAggregate.scala new file mode 100644 index 000000000..f427d9ee3 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/LocalCountAggregate.scala @@ -0,0 +1,117 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.aggstats + +import astraea.spark.rasterframes.expressions.accessors.ExtractTile +import astraea.spark.rasterframes.functions.safeBinaryOp +import geotrellis.raster.mapalgebra.local.{Add, Defined, Undefined} +import geotrellis.raster.{IntConstantNoDataCellType, Tile} +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, AggregateFunction, AggregateMode, Complete} +import org.apache.spark.sql.catalyst.expressions.{ExprId, Expression, ExpressionDescription, NamedExpression} +import org.apache.spark.sql.execution.aggregate.ScalaUDAF +import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} +import org.apache.spark.sql.rf.TileUDT +import org.apache.spark.sql.types.{DataType, StructField, StructType} +import org.apache.spark.sql.{Column, Row, TypedColumn} + +/** + * Catalyst aggregate function that counts `NoData` values in a cell-wise fashion. + * + * @param isData true if count should be of non-NoData values, false for NoData values. 
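Since a GeoTrellis `StreamingHistogram` is not directly storable in an aggregation buffer, each update above round-trips the histogram through Kryo via `marshall`/`unmarshall`, with `safeMerge` guarding the null cases. From the DataFrame side the bookkeeping is invisible; usage sketch, same assumed `df`:

import astraea.spark.rasterframes.expressions.aggstats.HistogramAggregate
import org.apache.spark.sql.functions.col

val hist = df.agg(HistogramAggregate(col("tile")))  // one approximate CellHistogram for the column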
+ * @since 8/11/17 + */ +class LocalCountAggregate(isData: Boolean) extends UserDefinedAggregateFunction { + + private val incCount = + if (isData) safeBinaryOp((t1: Tile, t2: Tile) ⇒ Add(t1, Defined(t2))) + else safeBinaryOp((t1: Tile, t2: Tile) ⇒ Add(t1, Undefined(t2))) + + private val add = safeBinaryOp(Add.apply(_: Tile, _: Tile)) + + private val TileType = new TileUDT() + + override def dataType: DataType = TileType + + override def inputSchema: StructType = StructType(Seq( + StructField("value", TileType, true) + )) + + override def bufferSchema: StructType = inputSchema + + override def deterministic: Boolean = true + + override def initialize(buffer: MutableAggregationBuffer): Unit = + buffer(0) = null + + override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { + val right = input.getAs[Tile](0) + if (right != null) { + if (buffer(0) == null) { + buffer(0) = ( + if (isData) Defined(right) else Undefined(right) + ).convert(IntConstantNoDataCellType) + } else { + val left = buffer.getAs[Tile](0) + buffer(0) = incCount(left, right) + } + } + } + + override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { + buffer1(0) = add(buffer1.getAs[Tile](0), buffer2.getAs[Tile](0)) + } + + override def evaluate(buffer: Row): Tile = buffer.getAs[Tile](0) +} +object LocalCountAggregate { + import astraea.spark.rasterframes.encoders.StandardEncoders.singlebandTileEncoder + @ExpressionDescription( + usage = "_FUNC_(tile) - Compute cell-wise count of non-no-data values." + ) + class LocalDataCellsUDAF(aggregateFunction: AggregateFunction, mode: AggregateMode, isDistinct: Boolean, resultId: ExprId) extends AggregateExpression(aggregateFunction, mode, isDistinct, resultId) { + def this(child: Expression) = this(ScalaUDAF(Seq(ExtractTile(child)), new LocalCountAggregate(true)), Complete, false, NamedExpression.newExprId) + override def nodeName: String = "agg_local_data_cells" + } + object LocalDataCellsUDAF { + def apply(child: Expression): LocalDataCellsUDAF = new LocalDataCellsUDAF(child) + def apply(tile: Column): TypedColumn[Any, Tile] = + new Column(new LocalDataCellsUDAF(tile.expr)) + .as(s"agg_local_data_cells($tile)") + .as[Tile] + } + + @ExpressionDescription( + usage = "_FUNC_(tile) - Compute cell-wise count of no-data values." + ) + class LocalNoDataCellsUDAF(aggregateFunction: AggregateFunction, mode: AggregateMode, isDistinct: Boolean, resultId: ExprId) extends AggregateExpression(aggregateFunction, mode, isDistinct, resultId) { + def this(child: Expression) = this(ScalaUDAF(Seq(ExtractTile(child)), new LocalCountAggregate(false)), Complete, false, NamedExpression.newExprId) + override def nodeName: String = "agg_local_no_data_cells" + } + object LocalNoDataCellsUDAF { + def apply(child: Expression): LocalNoDataCellsUDAF = new LocalNoDataCellsUDAF(child) + def apply(tile: Column): TypedColumn[Any, Tile] = + new Column(new LocalNoDataCellsUDAF(tile.expr)) + .as(s"agg_local_no_data_cells($tile)") + .as[Tile] + } + +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/LocalMeanAggregate.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/LocalMeanAggregate.scala new file mode 100644 index 000000000..bab1eba20 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/LocalMeanAggregate.scala @@ -0,0 +1,82 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. 
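In `LocalCountAggregate` the first non-null tile seeds the buffer with `Defined`/`Undefined` converted to `IntConstantNoDataCellType`; later tiles accumulate counts cell-wise, so every tile in the column must share dimensions. Usage sketch of both variants, same assumed `df`:

import astraea.spark.rasterframes.expressions.aggstats.LocalCountAggregate.{LocalDataCellsUDAF, LocalNoDataCellsUDAF}
import org.apache.spark.sql.functions.col

val localCounts = df.agg(LocalDataCellsUDAF(col("tile")), LocalNoDataCellsUDAF(col("tile")))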
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * [http://www.apache.org/licenses/LICENSE-2.0]
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ */
+
+package astraea.spark.rasterframes.expressions.aggstats
+
+import astraea.spark.rasterframes.expressions.UnaryRasterAggregate
+import astraea.spark.rasterframes.expressions.localops.{Add => AddTiles, Divide => DivideTiles}
+import astraea.spark.rasterframes.expressions.transformers.SetCellType
+import geotrellis.raster.Tile
+import geotrellis.raster.mapalgebra.local
+import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, ExpressionDescription, If, IsNull, Literal}
+import org.apache.spark.sql.rf.TileUDT
+import org.apache.spark.sql.types.DataType
+import org.apache.spark.sql.{Column, TypedColumn}
+
+@ExpressionDescription(
+  usage = "_FUNC_(tile) - Computes a new tile containing the mean cell values across all tiles in a column.",
+  note = "All tiles in the column must be the same size."
+)
+case class LocalMeanAggregate(child: Expression) extends UnaryRasterAggregate {
+  private val TileType = new TileUDT()
+
+  override def dataType: DataType = TileType
+  override def nodeName: String = "agg_local_mean"
+
+  private lazy val count =
+    AttributeReference("count", TileType, true)()
+  private lazy val sum =
+    AttributeReference("sum", TileType, true)()
+
+  override def aggBufferAttributes: Seq[AttributeReference] = Seq(
+    count,
+    sum
+  )
+
+  private lazy val Defined = tileOpAsExpression("defined_cells", local.Defined.apply)
+
+  override lazy val initialValues: Seq[Expression] = Seq(
+    Literal.create(null, TileType),
+    Literal.create(null, TileType)
+  )
+  override lazy val updateExpressions: Seq[Expression] = Seq(
+    If(IsNull(count),
+      SetCellType(Defined(child), Literal("int32")),
+      If(IsNull(child), count, AddTiles(count, Defined(child)))
+    ),
+    If(IsNull(sum),
+      SetCellType(child, Literal("float64")),
+      If(IsNull(child), sum, AddTiles(sum, child))
+    )
+  )
+  override val mergeExpressions: Seq[Expression] = Seq(
+    AddTiles(count.left, count.right),
+    AddTiles(sum.left, sum.right)
+  )
+  override lazy val evaluateExpression: Expression = DivideTiles(sum, count)
+}
+object LocalMeanAggregate {
+  import astraea.spark.rasterframes.encoders.StandardEncoders.singlebandTileEncoder
+
+  def apply(tile: Column): TypedColumn[Any, Tile] =
+    new Column(new LocalMeanAggregate(tile.expr).toAggregateExpression()).as[Tile]
+
+}
diff --git a/core/src/main/scala/astraea/spark/rasterframes/functions/LocalStatsAggregate.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/LocalStatsAggregate.scala
similarity index 61%
rename from core/src/main/scala/astraea/spark/rasterframes/functions/LocalStatsAggregate.scala
rename to core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/LocalStatsAggregate.scala
index 13c408f14..8df684a25 100644
--- a/core/src/main/scala/astraea/spark/rasterframes/functions/LocalStatsAggregate.scala
+++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/LocalStatsAggregate.scala
@@ -1,28 +1,39 @@
 /*
-
* Copyright 2017 Astraea, Inc. + * This software is licensed under the Apache 2 license, quoted below. * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Copyright 2019 Astraea, Inc. * - * http://www.apache.org/licenses/LICENSE-2.0 + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] * * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * */ -package astraea.spark.rasterframes.functions +package astraea.spark.rasterframes.expressions.aggstats +import astraea.spark.rasterframes.expressions.accessors.ExtractTile +import astraea.spark.rasterframes.functions.safeBinaryOp +import astraea.spark.rasterframes.stats.LocalCellStatistics +import astraea.spark.rasterframes.util.DataBiasedOp.{BiasedAdd, BiasedMax, BiasedMin} import geotrellis.raster.mapalgebra.local._ -import geotrellis.raster.{DoubleConstantNoDataCellType, IntConstantNoDataCellType, IntUserDefinedNoDataCellType, Tile, isNoData} -import org.apache.spark.sql.Row +import geotrellis.raster.{DoubleConstantNoDataCellType, IntConstantNoDataCellType, IntUserDefinedNoDataCellType, Tile} +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, AggregateFunction, AggregateMode, Complete} +import org.apache.spark.sql.catalyst.expressions.{ExprId, Expression, ExpressionDescription, NamedExpression} +import org.apache.spark.sql.execution.aggregate.ScalaUDAF import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} -import org.apache.spark.sql.types._ -import DataBiasedOp._ import org.apache.spark.sql.rf.TileUDT +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{Column, Row, TypedColumn} /** @@ -35,7 +46,9 @@ class LocalStatsAggregate() extends UserDefinedAggregateFunction { private val TileType = new TileUDT() - override def inputSchema: StructType = StructType(StructField("value", TileType) :: Nil) + override def inputSchema: StructType = StructType(Seq( + StructField("value", TileType, true) + )) override def dataType: DataType = StructType( @@ -133,6 +146,32 @@ class LocalStatsAggregate() extends UserDefinedAggregateFunction { } object LocalStatsAggregate { + + def apply(col: Column): TypedColumn[Any, LocalCellStatistics] = + new Column(LocalStatsAggregateUDAF(col.expr)) + .as(s"agg_local_stats($col)") + .as[LocalCellStatistics] + + /** Adapter hack to allow UserDefinedAggregateFunction to be referenced as an expression. 
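+   * A hypothetical SQL-level sketch, given this class is registered as
+   * `rf_agg_local_stats` elsewhere in this patch (table and column names assumed):
+   *   SELECT rf_agg_local_stats(tile) FROM tile_table;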
*/ + @ExpressionDescription( + usage = "_FUNC_(tile) - Compute cell-local aggregate descriptive statistics for a column of tiles.", + arguments = """ + Arguments: + * tile - tile column to analyze""", + examples = """ + Examples: + > SELECT _FUNC_(tile); + ...""" + ) + class LocalStatsAggregateUDAF(aggregateFunction: AggregateFunction, mode: AggregateMode, isDistinct: Boolean, resultId: ExprId) + extends AggregateExpression(aggregateFunction, mode, isDistinct, resultId) { + def this(child: Expression) = this(ScalaUDAF(Seq(ExtractTile(child)), new LocalStatsAggregate()), Complete, false, NamedExpression.newExprId) + override def nodeName: String = "agg_local_stats" + } + object LocalStatsAggregateUDAF { + def apply(child: Expression): LocalStatsAggregateUDAF = new LocalStatsAggregateUDAF(child) + } + /** Column index values. */ private object C { val COUNT = 0 diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/LocalTileOpAggregate.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/LocalTileOpAggregate.scala new file mode 100644 index 000000000..7a5032176 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/aggstats/LocalTileOpAggregate.scala @@ -0,0 +1,103 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.aggstats + +import astraea.spark.rasterframes.expressions.accessors.ExtractTile +import astraea.spark.rasterframes.functions.safeBinaryOp +import astraea.spark.rasterframes.util.DataBiasedOp.{BiasedMax, BiasedMin} +import geotrellis.raster.Tile +import geotrellis.raster.mapalgebra.local +import geotrellis.raster.mapalgebra.local.LocalTileBinaryOp +import org.apache.spark.sql.{Column, Row, TypedColumn} +import org.apache.spark.sql.catalyst.expressions.{ExprId, Expression, ExpressionDescription, NamedExpression} +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, AggregateFunction, AggregateMode, Complete} +import org.apache.spark.sql.execution.aggregate.ScalaUDAF +import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} +import org.apache.spark.sql.rf.TileUDT +import org.apache.spark.sql.types._ + +/** + * Aggregation function for applying a [[LocalTileBinaryOp]] pairwise across all tiles. Assumes Monoid algebra. 
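+ * @example A usage sketch via the companion objects defined below; the
+ *          DataFrame `df` and the "tile" column are hypothetical:
+ * {{{
+ *   df.agg(LocalTileOpAggregate.LocalMinUDAF($"tile"),
+ *          LocalTileOpAggregate.LocalMaxUDAF($"tile"))
+ * }}}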
+ * + * @since 4/17/17 + */ +class LocalTileOpAggregate(op: LocalTileBinaryOp) extends UserDefinedAggregateFunction { + + private val safeOp = safeBinaryOp(op.apply(_: Tile, _: Tile)) + + private val TileType = new TileUDT() + + override def inputSchema: StructType = StructType(Seq( + StructField("value", TileType, true) + )) + + override def bufferSchema: StructType = inputSchema + + override def dataType: DataType = TileType + + override def deterministic: Boolean = true + + override def initialize(buffer: MutableAggregationBuffer): Unit = + buffer(0) = null + + override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { + if (buffer(0) == null) { + buffer(0) = input(0) + } else { + val t1 = buffer.getAs[Tile](0) + val t2 = input.getAs[Tile](0) + buffer(0) = safeOp(t1, t2) + } + } + + override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = update(buffer1, buffer2) + + override def evaluate(buffer: Row): Tile = buffer.getAs[Tile](0) +} + +object LocalTileOpAggregate { + import astraea.spark.rasterframes.encoders.StandardEncoders.singlebandTileEncoder + + @ExpressionDescription( + usage = "_FUNC_(tile) - Compute cell-wise minimum value from a tile column." + ) + class LocalMinUDAF(aggregateFunction: AggregateFunction, mode: AggregateMode, isDistinct: Boolean, resultId: ExprId) extends AggregateExpression(aggregateFunction, mode, isDistinct, resultId) { + def this(child: Expression) = this(ScalaUDAF(Seq(ExtractTile(child)), new LocalTileOpAggregate(BiasedMin)), Complete, false, NamedExpression.newExprId) + override def nodeName: String = "agg_local_min" + } + object LocalMinUDAF { + def apply(child: Expression): LocalMinUDAF = new LocalMinUDAF(child) + def apply(tile: Column): TypedColumn[Any, Tile] = new Column(new LocalMinUDAF(tile.expr)).as[Tile] + } + + @ExpressionDescription( + usage = "_FUNC_(tile) - Compute cell-wise maximum value from a tile column." + ) + class LocalMaxUDAF(aggregateFunction: AggregateFunction, mode: AggregateMode, isDistinct: Boolean, resultId: ExprId) extends AggregateExpression(aggregateFunction, mode, isDistinct, resultId) { + def this(child: Expression) = this(ScalaUDAF(Seq(ExtractTile(child)), new LocalTileOpAggregate(BiasedMax)), Complete, false, NamedExpression.newExprId) + override def nodeName: String = "agg_local_max" + } + object LocalMaxUDAF { + def apply(child: Expression): LocalMaxUDAF = new LocalMaxUDAF(child) + def apply(tile: Column): TypedColumn[Any, Tile] = new Column(new LocalMaxUDAF(tile.expr)).as[Tile] + } +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/ExplodeTiles.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/generators/ExplodeTiles.scala similarity index 88% rename from core/src/main/scala/astraea/spark/rasterframes/expressions/ExplodeTiles.scala rename to core/src/main/scala/astraea/spark/rasterframes/expressions/generators/ExplodeTiles.scala index 9ecdcd007..e39ca1814 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/expressions/ExplodeTiles.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/generators/ExplodeTiles.scala @@ -1,7 +1,7 @@ /* * This software is licensed under the Apache 2 license, quoted below. * - * Copyright 2017 Astraea, Inc. + * Copyright 2019 Astraea, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of
@@ -15,9 +15,11 @@
  * License for the specific language governing permissions and limitations under
  * the License.
  *
+ * SPDX-License-Identifier: Apache-2.0
+ *
  */
 
-package astraea.spark.rasterframes.expressions
+package astraea.spark.rasterframes.expressions.generators
 
 import astraea.spark.rasterframes._
 import astraea.spark.rasterframes.encoders.CatalystSerializer._
@@ -27,6 +29,7 @@ import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
 import org.apache.spark.sql.catalyst.expressions.{Expression, Generator, GenericInternalRow}
+import org.apache.spark.sql.rf.TileUDT
 import org.apache.spark.sql.types._
 import spire.syntax.cfor.cfor
@@ -36,9 +39,10 @@ import spire.syntax.cfor.cfor
  * @since 4/12/17
  */
 case class ExplodeTiles(
-  sampleFraction: Double = 1.0, seed: Option[Long] = None, override val children: Seq[Expression])
-  extends Expression with Generator with CodegenFallback {
+  sampleFraction: Double, seed: Option[Long], override val children: Seq[Expression])
+  extends Expression with Generator with CodegenFallback {
+  def this(children: Seq[Expression]) = this(1.0, None, children)
   override def nodeName: String = "explode_tiles"
   override def elementSchema: StructType = {
@@ -64,7 +68,7 @@ case class ExplodeTiles(
     val tiles = Array.ofDim[Tile](children.length)
     cfor(0)(_ < tiles.length, _ + 1) { index =>
       val row = children(index).eval(input).asInstanceOf[InternalRow]
-      tiles(index) = if(row != null) row.to[Tile] else null
+      tiles(index) = if(row != null) row.to[Tile](TileUDT.tileSerializer) else null
     }
     val dims = tiles.filter(_ != null).map(_.dimensions)
     if(dims.isEmpty) Seq.empty[InternalRow]
diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Add.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Add.scala
new file mode 100644
index 000000000..d7f1a7867
--- /dev/null
+++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Add.scala
@@ -0,0 +1,75 @@
+/*
+ * This software is licensed under the Apache 2 license, quoted below.
+ *
+ * Copyright 2019 Astraea, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * [http://www.apache.org/licenses/LICENSE-2.0]
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.localops + +import astraea.spark.rasterframes._ +import astraea.spark.rasterframes.expressions.DynamicExtractors.tileExtractor +import astraea.spark.rasterframes.expressions.{BinaryLocalRasterOp, DynamicExtractors} +import astraea.spark.rasterframes.util.DataBiasedOp.BiasedAdd +import geotrellis.raster.Tile +import org.apache.spark.sql.rf._ +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription} +import org.apache.spark.sql.functions.lit +import org.apache.spark.sql.{Column, TypedColumn} + +@ExpressionDescription( + usage = "_FUNC_(tile, rhs) - Performs cell-wise addition between two tiles or a tile and a scalar.", + arguments = """ + Arguments: + * tile - left-hand-side tile + * rhs - a tile or scalar value to add to each cell""", + examples = """ + Examples: + > SELECT _FUNC_(tile, 1.5); + ... + > SELECT _FUNC_(tile1, tile2); + ...""" +) +case class Add(left: Expression, right: Expression) extends BinaryLocalRasterOp + with CodegenFallback { + override val nodeName: String = "local_add" + override protected def op(left: Tile, right: Tile): Tile = BiasedAdd(left, right) + override protected def op(left: Tile, right: Double): Tile = BiasedAdd(left, right) + override protected def op(left: Tile, right: Int): Tile = BiasedAdd(left, right) + + override def eval(input: InternalRow): Any = { + if(input == null) null + else { + val l = left.eval(input) + val r = right.eval(input) + if (l == null && r == null) null + else if (l == null) r + else if (r == null && tileExtractor.isDefinedAt(right.dataType)) l + else if (r == null) null + else nullSafeEval(l, r) + } + } +} +object Add { + def apply(left: Column, right: Column): TypedColumn[Any, Tile] = + new Column(Add(left.expr, right.expr)).as[Tile] + + def apply[N: Numeric](tile: Column, value: N): TypedColumn[Any, Tile] = + new Column(Add(tile.expr, lit(value).expr)).as[Tile] +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Divide.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Divide.scala new file mode 100644 index 000000000..37aa4ab6c --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Divide.scala @@ -0,0 +1,57 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ */
+
+package astraea.spark.rasterframes.expressions.localops
+
+import astraea.spark.rasterframes._
+import astraea.spark.rasterframes.expressions.BinaryLocalRasterOp
+import geotrellis.raster.Tile
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription}
+import org.apache.spark.sql.functions.lit
+import org.apache.spark.sql.{Column, TypedColumn}
+
+@ExpressionDescription(
+  usage = "_FUNC_(tile, rhs) - Performs cell-wise division between two tiles or a tile and a scalar.",
+  arguments = """
+  Arguments:
+    * tile - left-hand-side tile
+    * rhs - a tile or scalar value to divide each cell by""",
+  examples = """
+  Examples:
+    > SELECT _FUNC_(tile, 1.5);
+       ...
+    > SELECT _FUNC_(tile1, tile2);
+       ..."""
+)
+case class Divide(left: Expression, right: Expression) extends BinaryLocalRasterOp with CodegenFallback {
+  override val nodeName: String = "local_divide"
+  override protected def op(left: Tile, right: Tile): Tile = left.localDivide(right)
+  override protected def op(left: Tile, right: Double): Tile = left.localDivide(right)
+  override protected def op(left: Tile, right: Int): Tile = left.localDivide(right)
+}
+object Divide {
+  def apply(left: Column, right: Column): TypedColumn[Any, Tile] =
+    new Column(Divide(left.expr, right.expr)).as[Tile]
+
+  def apply[N: Numeric](tile: Column, value: N): TypedColumn[Any, Tile] =
+    new Column(Divide(tile.expr, lit(value).expr)).as[Tile]
+}
diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Equal.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Equal.scala
new file mode 100644
index 000000000..610b8beff
--- /dev/null
+++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Equal.scala
@@ -0,0 +1,56 @@
+/*
+ * This software is licensed under the Apache 2 license, quoted below.
+ *
+ * Copyright 2019 Astraea, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * [http://www.apache.org/licenses/LICENSE-2.0]
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.localops + +import astraea.spark.rasterframes._ +import astraea.spark.rasterframes.expressions.BinaryLocalRasterOp +import geotrellis.raster.Tile +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.functions.lit +import org.apache.spark.sql.{Column, TypedColumn} + +@ExpressionDescription( + usage = "_FUNC_(lhs, rhs) - Performs cell-wise equality test between two tiles.", + arguments = """ + Arguments: + * lhs - first tile argument + * rhs - second tile argument""", + examples = """ + Examples: + > SELECT _FUNC_(tile1, tile2); + ...""" +) +case class Equal(left: Expression, right: Expression) extends BinaryLocalRasterOp with CodegenFallback { + override val nodeName: String = "local_equal" + override protected def op(left: Tile, right: Tile): Tile = left.localEqual(right) + override protected def op(left: Tile, right: Double): Tile = left.localEqual(right) + override protected def op(left: Tile, right: Int): Tile = left.localEqual(right) +} + +object Equal { + def apply(left: Column, right: Column): TypedColumn[Any, Tile] = + new Column(Equal(left.expr, right.expr)).as[Tile] + + def apply[N: Numeric](tile: Column, value: N): TypedColumn[Any, Tile] = + new Column(Equal(tile.expr, lit(value).expr)).as[Tile] +} \ No newline at end of file diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Exp.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Exp.scala new file mode 100644 index 000000000..40d34ee06 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Exp.scala @@ -0,0 +1,116 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ */
+
+package astraea.spark.rasterframes.expressions.localops
+
+import astraea.spark.rasterframes._
+import astraea.spark.rasterframes.expressions.{UnaryLocalRasterOp, fpTile}
+import geotrellis.raster.Tile
+import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription}
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.types.DataType
+import org.apache.spark.sql.{Column, TypedColumn}
+
+
+@ExpressionDescription(
+  usage = "_FUNC_(tile) - Performs cell-wise exponential.",
+  arguments = """
+  Arguments:
+    * tile - input tile""",
+  examples = """
+  Examples:
+    > SELECT _FUNC_(tile);
+       ..."""
+)
+case class Exp(child: Expression) extends UnaryLocalRasterOp with CodegenFallback {
+  override val nodeName: String = "exp"
+
+  override protected def op(tile: Tile): Tile = fpTile(tile).localPowValue(math.E)
+
+  override def dataType: DataType = child.dataType
+}
+object Exp {
+  def apply(tile: Column): TypedColumn[Any, Tile] =
+    new Column(Exp(tile.expr)).as[Tile]
+}
+
+@ExpressionDescription(
+  usage = "_FUNC_(tile) - Compute 10 to the power of cell values.",
+  arguments = """
+  Arguments:
+    * tile - input tile""",
+  examples = """
+  Examples:
+    > SELECT _FUNC_(tile);
+       ..."""
+)
+case class Exp10(child: Expression) extends UnaryLocalRasterOp with CodegenFallback {
+  override val nodeName: String = "exp10"
+
+  override protected def op(tile: Tile): Tile = fpTile(tile).localPowValue(10.0)
+
+  override def dataType: DataType = child.dataType
+}
+object Exp10 {
+  def apply(tile: Column): TypedColumn[Any, Tile] = new Column(Exp10(tile.expr)).as[Tile]
+}
+
+@ExpressionDescription(
+  usage = "_FUNC_(tile) - Compute 2 to the power of cell values.",
+  arguments = """
+  Arguments:
+    * tile - input tile""",
+  examples = """
+  Examples:
+    > SELECT _FUNC_(tile);
+       ..."""
+)
+case class Exp2(child: Expression) extends UnaryLocalRasterOp with CodegenFallback {
+  override val nodeName: String = "exp2"
+
+  override protected def op(tile: Tile): Tile = fpTile(tile).localPowValue(2.0)
+
+  override def dataType: DataType = child.dataType
+}
+object Exp2 {
+  def apply(tile: Column): TypedColumn[Any, Tile] = new Column(Exp2(tile.expr)).as[Tile]
+}
+
+@ExpressionDescription(
+  usage = "_FUNC_(tile) - Performs cell-wise exponential, then subtracts one.",
+  arguments = """
+  Arguments:
+    * tile - input tile""",
+  examples = """
+  Examples:
+    > SELECT _FUNC_(tile);
+       ..."""
)
+case class ExpM1(child: Expression) extends UnaryLocalRasterOp with CodegenFallback {
+  override val nodeName: String = "expm1"
+
+  override protected def op(tile: Tile): Tile = fpTile(tile).localPowValue(math.E).localSubtract(1.0)
+
+  override def dataType: DataType = child.dataType
+}
+object ExpM1 {
+  def apply(tile: Column): TypedColumn[Any, Tile] = new Column(ExpM1(tile.expr)).as[Tile]
+}
diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Greater.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Greater.scala
new file mode 100644
index 000000000..f78022972
--- /dev/null
+++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Greater.scala
@@ -0,0 +1,55 @@
+/*
+ * This software is licensed under the Apache 2 license, quoted below.
+ *
+ * Copyright 2019 Astraea, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License.
You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * + */ +package astraea.spark.rasterframes.expressions.localops + +import astraea.spark.rasterframes._ +import astraea.spark.rasterframes.expressions.BinaryLocalRasterOp +import geotrellis.raster.Tile +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.functions.lit +import org.apache.spark.sql.{Column, TypedColumn} + +@ExpressionDescription( + usage = "_FUNC_(lhs, rhs) - Performs cell-wise greater-than (>) test between two tiles.", + arguments = """ + Arguments: + * lhs - first tile argument + * rhs - second tile argument""", + examples = """ + Examples: + > SELECT _FUNC_(tile1, tile2); + ...""" +) +case class Greater(left: Expression, right: Expression) extends BinaryLocalRasterOp with CodegenFallback { + override val nodeName: String = "local_greater" + override protected def op(left: Tile, right: Tile): Tile = left.localGreater(right) + override protected def op(left: Tile, right: Double): Tile = left.localGreater(right) + override protected def op(left: Tile, right: Int): Tile = left.localGreater(right) +} + +object Greater { + def apply(left: Column, right: Column): TypedColumn[Any, Tile] = + new Column(Greater(left.expr, right.expr)).as[Tile] + + def apply[N: Numeric](tile: Column, value: N): TypedColumn[Any, Tile] = + new Column(Greater(tile.expr, lit(value).expr)).as[Tile] +} \ No newline at end of file diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/GreaterEqual.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/GreaterEqual.scala new file mode 100644 index 000000000..bf43ceca5 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/GreaterEqual.scala @@ -0,0 +1,56 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.localops + +import astraea.spark.rasterframes._ +import astraea.spark.rasterframes.expressions.BinaryLocalRasterOp +import geotrellis.raster.Tile +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.functions.lit +import org.apache.spark.sql.{Column, TypedColumn} + +@ExpressionDescription( + usage = "_FUNC_(lhs, rhs) - Performs cell-wise greater-than-or-equal (>=) test between two tiles.", + arguments = """ + Arguments: + * lhs - first tile argument + * rhs - second tile argument""", + examples = """ + Examples: + > SELECT _FUNC_(tile1, tile2); + ...""" +) +case class GreaterEqual(left: Expression, right: Expression) extends BinaryLocalRasterOp with CodegenFallback { + override val nodeName: String = "local_greater_equal" + override protected def op(left: Tile, right: Tile): Tile = left.localGreaterOrEqual(right) + override protected def op(left: Tile, right: Double): Tile = left.localGreaterOrEqual(right) + override protected def op(left: Tile, right: Int): Tile = left.localGreaterOrEqual(right) +} + +object GreaterEqual { + def apply(left: Column, right: Column): TypedColumn[Any, Tile] = + new Column(GreaterEqual(left.expr, right.expr)).as[Tile] + + def apply[N: Numeric](tile: Column, value: N): TypedColumn[Any, Tile] = + new Column(GreaterEqual(tile.expr, lit(value).expr)).as[Tile] +} \ No newline at end of file diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Less.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Less.scala new file mode 100644 index 000000000..4f8d4ad7b --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Less.scala @@ -0,0 +1,54 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ +package astraea.spark.rasterframes.expressions.localops + +import astraea.spark.rasterframes._ +import astraea.spark.rasterframes.expressions.BinaryLocalRasterOp +import geotrellis.raster.Tile +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.functions.lit +import org.apache.spark.sql.{Column, TypedColumn} + +@ExpressionDescription( + usage = "_FUNC_(lhs, rhs) - Performs cell-wise less-than (<) test between two tiles.", + arguments = """ + Arguments: + * lhs - first tile argument + * rhs - second tile argument""", + examples = """ + Examples: + > SELECT _FUNC_(tile1, tile2); + ...""" +) +case class Less(left: Expression, right: Expression) extends BinaryLocalRasterOp with CodegenFallback { + override val nodeName: String = "local_less" + override protected def op(left: Tile, right: Tile): Tile = left.localLess(right) + override protected def op(left: Tile, right: Double): Tile = left.localLess(right) + override protected def op(left: Tile, right: Int): Tile = left.localLess(right) +} +object Less { + def apply(left: Column, right: Column): TypedColumn[Any, Tile] = + new Column(Less(left.expr, right.expr)).as[Tile] + + def apply[N: Numeric](tile: Column, value: N): TypedColumn[Any, Tile] = + new Column(Less(tile.expr, lit(value).expr)).as[Tile] +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/LessEqual.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/LessEqual.scala new file mode 100644 index 000000000..983ac7c0d --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/LessEqual.scala @@ -0,0 +1,55 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.localops + +import astraea.spark.rasterframes._ +import astraea.spark.rasterframes.expressions.BinaryLocalRasterOp +import geotrellis.raster.Tile +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.functions.lit +import org.apache.spark.sql.{Column, TypedColumn} + +@ExpressionDescription( + usage = "_FUNC_(lhs, rhs) - Performs cell-wise less-than-or-equal (<=) test between two tiles.", + arguments = """ + Arguments: + * lhs - first tile argument + * rhs - second tile argument""", + examples = """ + Examples: + > SELECT _FUNC_(tile1, tile2); + ...""" +) +case class LessEqual(left: Expression, right: Expression) extends BinaryLocalRasterOp with CodegenFallback { + override val nodeName: String = "local_less_equal" + override protected def op(left: Tile, right: Tile): Tile = left.localLessOrEqual(right) + override protected def op(left: Tile, right: Double): Tile = left.localLessOrEqual(right) + override protected def op(left: Tile, right: Int): Tile = left.localLessOrEqual(right) +} +object LessEqual { + def apply(left: Column, right: Column): TypedColumn[Any, Tile] = + new Column(LessEqual(left.expr, right.expr)).as[Tile] + + def apply[N: Numeric](tile: Column, value: N): TypedColumn[Any, Tile] = + new Column(LessEqual(tile.expr, lit(value).expr)).as[Tile] +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Log.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Log.scala new file mode 100644 index 000000000..e2da78ce1 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Log.scala @@ -0,0 +1,116 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.localops + +import astraea.spark.rasterframes._ +import astraea.spark.rasterframes.expressions.{UnaryLocalRasterOp, fpTile} +import geotrellis.raster.Tile +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.{Column, TypedColumn} + + +@ExpressionDescription( + usage = "_FUNC_(tile) - Performs cell-wise natural logarithm.", + arguments = """ + Arguments: + * tile - input tile""", + examples = """ + Examples: + > SELECT _FUNC_(tile); + ...""" +) +case class Log(child: Expression) extends UnaryLocalRasterOp with CodegenFallback { + override val nodeName: String = "log" + + override protected def op(tile: Tile): Tile = fpTile(tile).localLog() + + override def dataType: DataType = child.dataType +} +object Log { + def apply(tile: Column): TypedColumn[Any, Tile] = + new Column(Log(tile.expr)).as[Tile] +} + +@ExpressionDescription( + usage = "_FUNC_(tile) - Performs cell-wise logarithm with base 10.", + arguments = """ + Arguments: + * tile - input tile""", + examples = """ + Examples: + > SELECT _FUNC_(tile); + ...""" +) +case class Log10(child: Expression) extends UnaryLocalRasterOp with CodegenFallback { + override val nodeName: String = "log10" + + override protected def op(tile: Tile): Tile = fpTile(tile).localLog10() + + override def dataType: DataType = child.dataType +} +object Log10 { + def apply(tile: Column): TypedColumn[Any, Tile] = new Column(Log10(tile.expr)).as[Tile] +} + +@ExpressionDescription( + usage = "_FUNC_(tile) - Performs cell-wise logarithm with base 2.", + arguments = """ + Arguments: + * tile - input tile""", + examples = """ + Examples: + > SELECT _FUNC_(tile); + ...""" +) +case class Log2(child: Expression) extends UnaryLocalRasterOp with CodegenFallback { + override val nodeName: String = "log2" + + override protected def op(tile: Tile): Tile = fpTile(tile).localLog() / math.log(2.0) + + override def dataType: DataType = child.dataType +} +object Log2{ + def apply(tile: Column): TypedColumn[Any, Tile] = new Column(Log2(tile.expr)).as[Tile] +} + +@ExpressionDescription( + usage = "_FUNC_(tile) - Performs natural logarithm of cell values plus one.", + arguments = """ + Arguments: + * tile - input tile""", + examples = """ + Examples: + > SELECT _FUNC_(tile); + ...""" +) +case class Log1p(child: Expression) extends UnaryLocalRasterOp with CodegenFallback { + override val nodeName: String = "log1p" + + override protected def op(tile: Tile): Tile = fpTile(tile).localAdd(1.0).localLog() + + override def dataType: DataType = child.dataType +} +object Log1p{ + def apply(tile: Column): TypedColumn[Any, Tile] = new Column(Log1p(tile.expr)).as[Tile] +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Multiply.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Multiply.scala new file mode 100644 index 000000000..7ed7c76b8 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Multiply.scala @@ -0,0 +1,56 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of
+ * the License at
+ *
+ * [http://www.apache.org/licenses/LICENSE-2.0]
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ */
+
+package astraea.spark.rasterframes.expressions.localops
+
+import astraea.spark.rasterframes._
+import astraea.spark.rasterframes.expressions.BinaryLocalRasterOp
+import geotrellis.raster.Tile
+import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription}
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.functions.lit
+import org.apache.spark.sql.{Column, TypedColumn}
+
+@ExpressionDescription(
+  usage = "_FUNC_(tile, rhs) - Performs cell-wise multiplication between two tiles or a tile and a scalar.",
+  arguments = """
+  Arguments:
+    * tile - left-hand-side tile
+    * rhs - a tile or scalar value to multiply each cell by""",
+  examples = """
+  Examples:
+    > SELECT _FUNC_(tile, 1.5);
+       ...
+    > SELECT _FUNC_(tile1, tile2);
+       ..."""
+)
+case class Multiply(left: Expression, right: Expression) extends BinaryLocalRasterOp with CodegenFallback {
+  override val nodeName: String = "local_multiply"
+  override protected def op(left: Tile, right: Tile): Tile = left.localMultiply(right)
+  override protected def op(left: Tile, right: Double): Tile = left.localMultiply(right)
+  override protected def op(left: Tile, right: Int): Tile = left.localMultiply(right)
+}
+object Multiply {
+  def apply(left: Column, right: Column): TypedColumn[Any, Tile] =
+    new Column(Multiply(left.expr, right.expr)).as[Tile]
+  def apply[N: Numeric](tile: Column, value: N): TypedColumn[Any, Tile] =
+    new Column(Multiply(tile.expr, lit(value).expr)).as[Tile]
+}
diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/NormalizedDifference.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/NormalizedDifference.scala
new file mode 100644
index 000000000..5760582d6
--- /dev/null
+++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/NormalizedDifference.scala
@@ -0,0 +1,54 @@
+/*
+ * This software is licensed under the Apache 2 license, quoted below.
+ *
+ * Copyright 2019 Astraea, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * [http://www.apache.org/licenses/LICENSE-2.0]
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.localops +import astraea.spark.rasterframes.expressions.fpTile +import astraea.spark.rasterframes._ +import astraea.spark.rasterframes.expressions.BinaryRasterOp +import geotrellis.raster.Tile +import org.apache.spark.sql.{Column, TypedColumn} +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback + +@ExpressionDescription( + usage = "_FUNC_(left, right) - Computes the normalized difference '(left - right) / (left + right)' between two tile columns", + note = "Common usage includes computing NDVI via red and NIR bands.", + arguments = """ + Arguments: + * left - first tile argument + * right - second tile argument""", + examples = """ + Examples: + > SELECT _FUNC_(nir, red); + ...""" +) +case class NormalizedDifference(left: Expression, right: Expression) extends BinaryRasterOp with CodegenFallback { + override val nodeName: String = "normalized_difference" + override protected def op(left: Tile, right: Tile): Tile = { + val diff = fpTile(left.localSubtract(right)) + val sum = fpTile(left.localAdd(right)) + diff.localDivide(sum) + } +} +object NormalizedDifference { + def apply(left: Column, right: Column): TypedColumn[Any, Tile] = + new Column(NormalizedDifference(left.expr, right.expr)).as[Tile] +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Resample.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Resample.scala new file mode 100644 index 000000000..fd2ae2f29 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Resample.scala @@ -0,0 +1,76 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.localops + +import astraea.spark.rasterframes._ +import astraea.spark.rasterframes.expressions.DynamicExtractors.tileExtractor +import astraea.spark.rasterframes.expressions.BinaryLocalRasterOp +import geotrellis.raster.Tile +import geotrellis.raster.resample.NearestNeighbor +import org.apache.spark.sql.rf._ +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription} +import org.apache.spark.sql.functions.lit +import org.apache.spark.sql.{Column, TypedColumn} + +@ExpressionDescription( + usage = "_FUNC_(tile, factor) - Resample tile to different size based on scalar factor or tile whose dimension to match. Scalar less than one will downsample tile; greater than one will upsample. 
Uses nearest-neighbor value.", + arguments = """ + Arguments: + * tile - tile + * rhs - scalar or tile to match dimension""", + examples = """ + Examples: + > SELECT _FUNC_(tile, 2.0); + ... + > SELECT _FUNC_(tile1, tile2); + ...""" +) +case class Resample(left: Expression, right: Expression) extends BinaryLocalRasterOp + with CodegenFallback { + override val nodeName: String = "resample" + override protected def op(left: Tile, right: Tile): Tile = left.resample(right.cols, right.rows, NearestNeighbor) + override protected def op(left: Tile, right: Double): Tile = left.resample((left.cols * right).toInt, + (left.rows * right).toInt, NearestNeighbor) + override protected def op(left: Tile, right: Int): Tile = op(left, right.toDouble) + + override def eval(input: InternalRow): Any = { + if(input == null) null + else { + val l = left.eval(input) + val r = right.eval(input) + if (l == null && r == null) null + else if (l == null) r + else if (r == null && tileExtractor.isDefinedAt(right.dataType)) l + else if (r == null) null + else nullSafeEval(l, r) + } + } +} +object Resample{ + def apply(left: Column, right: Column): TypedColumn[Any, Tile] = + new Column(Resample(left.expr, right.expr)).as[Tile] + + def apply[N: Numeric](tile: Column, value: N): TypedColumn[Any, Tile] = + new Column(Resample(tile.expr, lit(value).expr)).as[Tile] +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Round.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Round.scala new file mode 100644 index 000000000..010666e17 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Round.scala @@ -0,0 +1,52 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ */
+
+package astraea.spark.rasterframes.expressions.localops
+
+import astraea.spark.rasterframes._
+import astraea.spark.rasterframes.expressions.{NullToValue, UnaryLocalRasterOp}
+import geotrellis.raster.Tile
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription}
+import org.apache.spark.sql.{Column, TypedColumn}
+
+@ExpressionDescription(
+  usage = "_FUNC_(tile) - Round cell values to the nearest integer without changing the cell type.",
+  arguments = """
+  Arguments:
+    * tile - tile column to round""",
+  examples = """
+  Examples:
+    > SELECT _FUNC_(tile);
+       ..."""
+)
+case class Round(child: Expression) extends UnaryLocalRasterOp
+  with NullToValue with CodegenFallback {
+  override def nodeName: String = "round"
+  override def na: Any = null
+  override protected def op(child: Tile): Tile = child.localRound()
+}
+object Round {
+
+  def apply(tile: Column): TypedColumn[Any, Tile] =
+    new Column(Round(tile.expr)).as[Tile]
+
+}
diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Subtract.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Subtract.scala
new file mode 100644
index 000000000..203bb578d
--- /dev/null
+++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Subtract.scala
@@ -0,0 +1,56 @@
+/*
+ * This software is licensed under the Apache 2 license, quoted below.
+ *
+ * Copyright 2019 Astraea, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * [http://www.apache.org/licenses/LICENSE-2.0]
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ */
+
+package astraea.spark.rasterframes.expressions.localops
+
+import astraea.spark.rasterframes._
+import astraea.spark.rasterframes.expressions.BinaryLocalRasterOp
+import geotrellis.raster.Tile
+import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription}
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.functions.lit
+import org.apache.spark.sql.{Column, TypedColumn}
+
+@ExpressionDescription(
+  usage = "_FUNC_(tile, rhs) - Performs cell-wise subtraction between two tiles or a tile and a scalar.",
+  arguments = """
+  Arguments:
+    * tile - left-hand-side tile
+    * rhs - a tile or scalar value to subtract from each cell""",
+  examples = """
+  Examples:
+    > SELECT _FUNC_(tile, 1.5);
+       ...
+ > SELECT _FUNC_(tile1, tile2); + ...""" +) +case class Subtract(left: Expression, right: Expression) extends BinaryLocalRasterOp with CodegenFallback { + override val nodeName: String = "local_subtract" + override protected def op(left: Tile, right: Tile): Tile = left.localSubtract(right) + override protected def op(left: Tile, right: Double): Tile = left.localSubtract(right) + override protected def op(left: Tile, right: Int): Tile = left.localSubtract(right) +} +object Subtract { + def apply(left: Column, right: Column): TypedColumn[Any, Tile] = + new Column(Subtract(left.expr, right.expr)).as[Tile] + + def apply[N: Numeric](tile: Column, value: N): TypedColumn[Any, Tile] = + new Column(Subtract(tile.expr, lit(value).expr)).as[Tile] +} \ No newline at end of file diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Unequal.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Unequal.scala new file mode 100644 index 000000000..f3342b9c6 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/localops/Unequal.scala @@ -0,0 +1,56 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.localops + +import astraea.spark.rasterframes._ +import astraea.spark.rasterframes.expressions.BinaryLocalRasterOp +import geotrellis.raster.Tile +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.functions.lit +import org.apache.spark.sql.{Column, TypedColumn} + +@ExpressionDescription( + usage = "_FUNC_(lhs, rhs) - Performs cell-wise inequality test between two tiles.", + arguments = """ + Arguments: + * lhs - first tile argument + * rhs - second tile argument""", + examples = """ + Examples: + > SELECT _FUNC_(tile1, tile2); + ...""" +) +case class Unequal(left: Expression, right: Expression) extends BinaryLocalRasterOp with CodegenFallback { + override val nodeName: String = "local_unequal" + override protected def op(left: Tile, right: Tile): Tile = left.localUnequal(right) + override protected def op(left: Tile, right: Double): Tile = left.localUnequal(right) + override protected def op(left: Tile, right: Int): Tile = left.localUnequal(right) +} + +object Unequal { + def apply(left: Column, right: Column): TypedColumn[Any, Tile] = + new Column(Unequal(left.expr, right.expr)).as[Tile] + + def apply[N: Numeric](tile: Column, value: N): TypedColumn[Any, Tile] = + new Column(Unequal(tile.expr, lit(value).expr)).as[Tile] +} \ No newline at end of file diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/package.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/package.scala index 3255dd719..e4c0bcc00 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/expressions/package.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/package.scala @@ -19,11 +19,21 @@ package astraea.spark.rasterframes -import org.apache.spark.sql.catalyst.InternalRow +import astraea.spark.rasterframes.expressions.accessors._ +import astraea.spark.rasterframes.expressions.aggstats._ +import astraea.spark.rasterframes.expressions.generators._ +import astraea.spark.rasterframes.expressions.localops._ +import astraea.spark.rasterframes.expressions.tilestats._ +import astraea.spark.rasterframes.expressions.transformers._ +import geotrellis.raster.{DoubleConstantNoDataCellType, Tile} +import org.apache.spark.sql.catalyst.{InternalRow, ScalaReflection} import org.apache.spark.sql.catalyst.analysis.FunctionRegistry -import org.apache.spark.sql.rf.VersionShims +import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUDF} +import org.apache.spark.sql.rf.VersionShims._ import org.apache.spark.sql.{SQLContext, rf} +import scala.util.Try +import scala.reflect.runtime.universe._ /** * Module of Catalyst expressions for efficiently working with tiles. * @@ -31,20 +41,78 @@ import org.apache.spark.sql.{SQLContext, rf} */ package object expressions { private[expressions] def row(input: Any) = input.asInstanceOf[InternalRow] + /** Convert the tile to a floating point type as needed for scalar operations. */ + @inline + private[expressions] + def fpTile(t: Tile) = if (t.cellType.isFloatingPoint) t else t.convert(DoubleConstantNoDataCellType) - /** Unary expression builder builder. */ - private def ub[A, B](f: A ⇒ B)(a: Seq[A]): B = f(a.head) - /** Binary expression builder builder. 
 */
-  private def bb[A, B](f: (A, A) ⇒ B)(a: Seq[A]): B = f(a.head, a.last)
+  /** As opposed to `udf`, this constructs an unwrapped ScalaUDF Expression from a function. */
+  private[expressions]
+  def udfexpr[RT: TypeTag, A1: TypeTag](name: String, f: A1 => RT): Expression => ScalaUDF = (child: Expression) => {
+    val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT]
+    val inputTypes = Try(ScalaReflection.schemaFor(typeTag[A1]).dataType :: Nil).toOption
+    ScalaUDF(f, dataType, Seq(child), inputTypes.getOrElse(Nil), nullable = nullable, udfName = Some(name))
+  }

   def register(sqlContext: SQLContext): Unit = {
     // Expression-oriented functions have a different registration scheme
     // Currently have to register with the `builtin` registry due to Spark data hiding.
     val registry: FunctionRegistry = rf.registry(sqlContext)
-    VersionShims.registerExpression(registry, "rf_explode_tiles", ExplodeTiles.apply(1.0, None, _))
-    VersionShims.registerExpression(registry, "rf_cell_type", ub(GetCellType.apply))
-    VersionShims.registerExpression(registry, "rf_convert_cell_type", bb(SetCellType.apply))
-    VersionShims.registerExpression(registry, "rf_tile_dimensions", ub(GetDimensions.apply))
-    VersionShims.registerExpression(registry, "rf_bounds_geometry", ub(BoundsToGeometry.apply))
+
+    registry.registerExpression[Add]("rf_local_add")
+    registry.registerExpression[Subtract]("rf_local_subtract")
+    registry.registerExpression[ExplodeTiles]("rf_explode_tiles")
+    registry.registerExpression[GetCellType]("rf_cell_type")
+    registry.registerExpression[SetCellType]("rf_convert_cell_type")
+    registry.registerExpression[GetDimensions]("rf_tile_dimensions")
+    registry.registerExpression[BoundsToGeometry]("rf_bounds_geometry")
+    registry.registerExpression[Multiply]("rf_local_multiply")
+    registry.registerExpression[Divide]("rf_local_divide")
+    registry.registerExpression[NormalizedDifference]("rf_normalized_difference")
+    registry.registerExpression[Less]("rf_local_less")
+    registry.registerExpression[Greater]("rf_local_greater")
+    registry.registerExpression[LessEqual]("rf_local_less_equal")
+    registry.registerExpression[GreaterEqual]("rf_local_greater_equal")
+    registry.registerExpression[Equal]("rf_local_equal")
+    registry.registerExpression[Unequal]("rf_local_unequal")
+    registry.registerExpression[Sum]("rf_tile_sum")
+    registry.registerExpression[Round]("rf_round")
+    registry.registerExpression[Log]("rf_log")
+    registry.registerExpression[Log10]("rf_log10")
+    registry.registerExpression[Log2]("rf_log2")
+    registry.registerExpression[Log1p]("rf_log1p")
+    registry.registerExpression[Exp]("rf_exp")
+    registry.registerExpression[Exp10]("rf_exp10")
+    registry.registerExpression[Exp2]("rf_exp2")
+    registry.registerExpression[ExpM1]("rf_expm1")
+    registry.registerExpression[Resample]("rf_resample")
+    registry.registerExpression[TileToArrayDouble]("rf_tile_to_array_double")
+    registry.registerExpression[TileToArrayInt]("rf_tile_to_array_int")
+    registry.registerExpression[DataCells]("rf_data_cells")
+    registry.registerExpression[NoDataCells]("rf_no_data_cells")
+    registry.registerExpression[IsNoDataTile]("rf_is_no_data_tile")
+    registry.registerExpression[TileMin]("rf_tile_min")
+    registry.registerExpression[TileMax]("rf_tile_max")
+    registry.registerExpression[TileMean]("rf_tile_mean")
+    registry.registerExpression[TileStats]("rf_tile_stats")
+    registry.registerExpression[TileHistogram]("rf_tile_histogram")
+
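+    // The registrations below cover the aggregate (UDAF-backed), masking, and
+    // debug-rendering expressions. Once `register` has run, all of these functions
+    // are callable from SQL. A minimal usage sketch (assuming a SparkSession `spark`
+    // with RasterFrames initialized, and a table `tiles` with a tile column `tile`;
+    // both names are illustrative only, not part of this change):
+    //
+    //   spark.sql("SELECT rf_tile_mean(tile), rf_data_cells(tile) FROM tiles")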
registry.registerExpression[CellCountAggregate.DataCells]("rf_agg_data_cells") + registry.registerExpression[CellCountAggregate.NoDataCells]("rf_agg_no_data_cells") + registry.registerExpression[CellStatsAggregate.CellStatsAggregateUDAF]("rf_agg_stats") + registry.registerExpression[HistogramAggregate.HistogramAggregateUDAF]("rf_agg_approx_histogram") + registry.registerExpression[LocalStatsAggregate.LocalStatsAggregateUDAF]("rf_agg_local_stats") + registry.registerExpression[LocalTileOpAggregate.LocalMinUDAF]("rf_agg_local_min") + registry.registerExpression[LocalTileOpAggregate.LocalMaxUDAF]("rf_agg_local_max") + registry.registerExpression[LocalCountAggregate.LocalDataCellsUDAF]("rf_agg_local_data_cells") + registry.registerExpression[LocalCountAggregate.LocalNoDataCellsUDAF]("rf_agg_local_no_data_cells") + registry.registerExpression[LocalMeanAggregate]("rf_agg_local_mean") + + registry.registerExpression[Mask.MaskByDefined]("rf_mask") + registry.registerExpression[Mask.MaskByValue]("rf_mask_by_value") + registry.registerExpression[Mask.InverseMaskByDefined]("rf_inverse_mask") + + registry.registerExpression[DebugRender.RenderAscii]("rf_render_ascii") + registry.registerExpression[DebugRender.RenderMatrix]("rf_render_matrix") } } diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/DataCells.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/DataCells.scala new file mode 100644 index 000000000..a7d49c4ae --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/DataCells.scala @@ -0,0 +1,62 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.tilestats +import astraea.spark.rasterframes.expressions.{UnaryRasterOp, NullToValue} +import astraea.spark.rasterframes.model.TileContext +import geotrellis.raster._ +import org.apache.spark.sql.{Column, TypedColumn} +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.types.{DataType, LongType} + +@ExpressionDescription( + usage = "_FUNC_(tile) - Counts the number of non-no-data cells in a tile", + arguments = """ + Arguments: + * tile - tile column to analyze""", + examples = """ + Examples: + > SELECT _FUNC_(tile); + 357""" +) +case class DataCells(child: Expression) extends UnaryRasterOp + with CodegenFallback with NullToValue { + override def nodeName: String = "data_cells" + override def dataType: DataType = LongType + override protected def eval(tile: Tile, ctx: Option[TileContext]): Any = DataCells.op(tile) + override def na: Any = 0L +} +object DataCells { + import astraea.spark.rasterframes.encoders.StandardEncoders.PrimitiveEncoders.longEnc + def apply(tile: Column): TypedColumn[Any, Long] = + new Column(DataCells(tile.expr)).as[Long] + + val op = (tile: Tile) => { + var count: Long = 0 + tile.dualForeach( + z ⇒ if(isData(z)) count = count + 1 + ) ( + z ⇒ if(isData(z)) count = count + 1 + ) + count + } +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/IsNoDataTile.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/IsNoDataTile.scala new file mode 100644 index 000000000..7b360a07c --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/IsNoDataTile.scala @@ -0,0 +1,52 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.tilestats +import astraea.spark.rasterframes.expressions.{NullToValue, UnaryRasterOp} +import astraea.spark.rasterframes.model.TileContext +import geotrellis.raster._ +import org.apache.spark.sql.{Column, TypedColumn} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription} +import org.apache.spark.sql.types.{BooleanType, DataType} + +@ExpressionDescription( + usage = "_FUNC_(tile) - Produces `true` if all the cells in a given tile are no-data", + arguments = """ + Arguments: + * tile - tile column to analyze""", + examples = """ + Examples: + > SELECT _FUNC_(tile); + false""" +) +case class IsNoDataTile(child: Expression) extends UnaryRasterOp + with CodegenFallback with NullToValue { + override def nodeName: String = "is_no_data_tile" + override def na: Any = true + override def dataType: DataType = BooleanType + override protected def eval(tile: Tile, ctx: Option[TileContext]): Any = tile.isNoDataTile +} +object IsNoDataTile { + import astraea.spark.rasterframes.encoders.StandardEncoders.PrimitiveEncoders.boolEnc + def apply(tile: Column): TypedColumn[Any, Boolean] = + new Column(IsNoDataTile(tile.expr)).as[Boolean] +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/NoDataCells.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/NoDataCells.scala new file mode 100644 index 000000000..89c2ae10b --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/NoDataCells.scala @@ -0,0 +1,63 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.tilestats + +import astraea.spark.rasterframes.expressions.{UnaryRasterOp, NullToValue} +import astraea.spark.rasterframes.model.TileContext +import geotrellis.raster._ +import org.apache.spark.sql.{Column, TypedColumn} +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.types.{DataType, LongType} + +@ExpressionDescription( + usage = "_FUNC_(tile) - Counts the number of no-data cells in a tile", + arguments = """ + Arguments: + * tile - tile column to analyze""", + examples = """ + Examples: + > SELECT _FUNC_(tile); + 12""" +) +case class NoDataCells(child: Expression) extends UnaryRasterOp + with CodegenFallback with NullToValue { + override def nodeName: String = "no_data_cells" + override def dataType: DataType = LongType + override protected def eval(tile: Tile, ctx: Option[TileContext]): Any = NoDataCells.op(tile) + override def na: Any = 0L +} +object NoDataCells { + import astraea.spark.rasterframes.encoders.StandardEncoders.PrimitiveEncoders.longEnc + def apply(tile: Column): TypedColumn[Any, Long] = + new Column(NoDataCells(tile.expr)).as[Long] + + val op = (tile: Tile) => { + var count: Long = 0 + tile.dualForeach( + z ⇒ if(isNoData(z)) count = count + 1 + ) ( + z ⇒ if(isNoData(z)) count = count + 1 + ) + count + } +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/Sum.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/Sum.scala new file mode 100644 index 000000000..cfa10666b --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/Sum.scala @@ -0,0 +1,57 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ */
+
+package astraea.spark.rasterframes.expressions.tilestats
+import astraea.spark.rasterframes.expressions.UnaryRasterOp
+import astraea.spark.rasterframes.model.TileContext
+import geotrellis.raster._
+import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription}
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.types.{DataType, DoubleType}
+import org.apache.spark.sql.{Column, TypedColumn}
+
+@ExpressionDescription(
+  usage = "_FUNC_(tile) - Computes the sum of all the cells in a tile.",
+  arguments = """
+  Arguments:
+    * tile - tile to sum up""",
+  examples = """
+  Examples:
+    > SELECT _FUNC_(tile5);
+       2135.34"""
+)
+case class Sum(child: Expression) extends UnaryRasterOp with CodegenFallback {
+  override def nodeName: String = "tile_sum"
+  override def dataType: DataType = DoubleType
+  override protected def eval(tile: Tile, ctx: Option[TileContext]): Any = Sum.op(tile)
+}
+
+object Sum {
+  import astraea.spark.rasterframes.encoders.StandardEncoders.PrimitiveEncoders.doubleEnc
+  def apply(tile: Column): TypedColumn[Any, Double] =
+    new Column(Sum(tile.expr)).as[Double]
+
+  def op = (tile: Tile) => {
+    var sum: Double = 0.0
+    tile.foreachDouble(z ⇒ if(isData(z)) sum = sum + z)
+    sum
+  }
+}
diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/TileHistogram.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/TileHistogram.scala
new file mode 100644
index 000000000..d7fe7d0c1
--- /dev/null
+++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/TileHistogram.scala
@@ -0,0 +1,60 @@
+/*
+ * This software is licensed under the Apache 2 license, quoted below.
+ *
+ * Copyright 2019 Astraea, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ *     [http://www.apache.org/licenses/LICENSE-2.0]
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.tilestats + +import astraea.spark.rasterframes.expressions.UnaryRasterOp +import astraea.spark.rasterframes.model.TileContext +import astraea.spark.rasterframes.stats.CellHistogram +import geotrellis.raster.Tile +import org.apache.spark.sql.catalyst.CatalystTypeConverters +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription} +import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.{Column, TypedColumn} + +@ExpressionDescription( + usage = "_FUNC_(tile) - Computes per-tile histogram.", + arguments = """ + Arguments: + * tile - tile column to analyze""", + examples = """ + Examples: + > SELECT _FUNC_(tile); + ...""" +) +case class TileHistogram(child: Expression) extends UnaryRasterOp + with CodegenFallback { + override def nodeName: String = "tile_histogram" + override protected def eval(tile: Tile, ctx: Option[TileContext]): Any = + TileHistogram.converter(TileHistogram.op(tile)) + override def dataType: DataType = CellHistogram.schema +} + +object TileHistogram { + def apply(tile: Column): TypedColumn[Any, CellHistogram] = + new Column(TileHistogram(tile.expr)).as[CellHistogram] + + private lazy val converter = CatalystTypeConverters.createToCatalystConverter(CellHistogram.schema) + + /** Single tile histogram. */ + val op = (t: Tile) ⇒ CellHistogram(t) +} \ No newline at end of file diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/TileMax.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/TileMax.scala new file mode 100644 index 000000000..0e2595b2a --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/TileMax.scala @@ -0,0 +1,62 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.tilestats + +import astraea.spark.rasterframes.expressions.{NullToValue, UnaryRasterOp} +import astraea.spark.rasterframes.model.TileContext +import geotrellis.raster.{Tile, isData} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription} +import org.apache.spark.sql.types.{DataType, DoubleType} +import org.apache.spark.sql.{Column, TypedColumn} + +@ExpressionDescription( + usage = "_FUNC_(tile) - Determines the maximum cell value.", + arguments = """ + Arguments: + * tile - tile column to analyze""", + examples = """ + Examples: + > SELECT _FUNC_(tile); + 1""" +) +case class TileMax(child: Expression) extends UnaryRasterOp + with NullToValue with CodegenFallback { + override def nodeName: String = "tile_max" + override protected def eval(tile: Tile, ctx: Option[TileContext]): Any = TileMax.op(tile) + override def dataType: DataType = DoubleType + override def na: Any = Double.MinValue +} +object TileMax { + import astraea.spark.rasterframes.encoders.StandardEncoders.PrimitiveEncoders.doubleEnc + + def apply(tile: Column): TypedColumn[Any, Double] = + new Column(TileMax(tile.expr)).as[Double] + + /** Find the maximum cell value. */ + val op = (tile: Tile) ⇒ { + var max: Double = Double.MinValue + tile.foreachDouble(z ⇒ if(isData(z)) max = math.max(max, z)) + if (max == Double.MinValue) Double.NaN + else max + } +} \ No newline at end of file diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/TileMean.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/TileMean.scala new file mode 100644 index 000000000..e23e68c08 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/TileMean.scala @@ -0,0 +1,67 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.tilestats + +import astraea.spark.rasterframes.expressions.{NullToValue, UnaryRasterOp} +import astraea.spark.rasterframes.functions.safeEval +import astraea.spark.rasterframes.model.TileContext +import geotrellis.raster.{Tile, isData} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription} +import org.apache.spark.sql.types.{DataType, DoubleType} +import org.apache.spark.sql.{Column, TypedColumn} + +@ExpressionDescription( + usage = "_FUNC_(tile) - Computes the mean cell value of a tile.", + arguments = """ + Arguments: + * tile - tile column to analyze""", + examples = """ + Examples: + > SELECT _FUNC_(tile); + -1""" +) +case class TileMean(child: Expression) extends UnaryRasterOp + with NullToValue with CodegenFallback { + override def nodeName: String = "tile_mean" + override protected def eval(tile: Tile, ctx: Option[TileContext]): Any = TileMean.op(tile) + override def dataType: DataType = DoubleType + override def na: Any = Double.NaN +} +object TileMean { + import astraea.spark.rasterframes.encoders.StandardEncoders.PrimitiveEncoders.doubleEnc + + def apply(tile: Column): TypedColumn[Any, Double] = + new Column(TileMean(tile.expr)).as[Double] + + /** Single tile mean. */ + val op = (t: Tile) ⇒ { + var sum: Double = 0.0 + var count: Long = 0 + t.dualForeach( + z ⇒ if(isData(z)) { count = count + 1; sum = sum + z } + ) ( + z ⇒ if(isData(z)) { count = count + 1; sum = sum + z } + ) + sum/count + } +} \ No newline at end of file diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/TileMin.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/TileMin.scala new file mode 100644 index 000000000..4d2edc9b3 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/TileMin.scala @@ -0,0 +1,62 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.tilestats + +import astraea.spark.rasterframes.expressions.{NullToValue, UnaryRasterOp} +import astraea.spark.rasterframes.model.TileContext +import geotrellis.raster.{Tile, isData} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription} +import org.apache.spark.sql.types.{DataType, DoubleType} +import org.apache.spark.sql.{Column, TypedColumn} + +@ExpressionDescription( + usage = "_FUNC_(tile) - Determines the minimum cell value.", + arguments = """ + Arguments: + * tile - tile column to analyze""", + examples = """ + Examples: + > SELECT _FUNC_(tile); + -1""" +) +case class TileMin(child: Expression) extends UnaryRasterOp + with NullToValue with CodegenFallback { + override def nodeName: String = "tile_min" + override protected def eval(tile: Tile, ctx: Option[TileContext]): Any = TileMin.op(tile) + override def dataType: DataType = DoubleType + override def na: Any = Double.MaxValue +} +object TileMin { + import astraea.spark.rasterframes.encoders.StandardEncoders.PrimitiveEncoders.doubleEnc + + def apply(tile: Column): TypedColumn[Any, Double] = + new Column(TileMin(tile.expr)).as[Double] + + /** Find the minimum cell value. */ + val op = (tile: Tile) ⇒ { + var min: Double = Double.MaxValue + tile.foreachDouble(z ⇒ if(isData(z)) min = math.min(min, z)) + if (min == Double.MaxValue) Double.NaN + else min + } +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/TileStats.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/TileStats.scala new file mode 100644 index 000000000..015f048e8 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/tilestats/TileStats.scala @@ -0,0 +1,59 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.tilestats + +import astraea.spark.rasterframes.expressions.UnaryRasterOp +import astraea.spark.rasterframes.model.TileContext +import astraea.spark.rasterframes.stats.CellStatistics +import geotrellis.raster.Tile +import org.apache.spark.sql.catalyst.CatalystTypeConverters +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription} +import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.{Column, TypedColumn} + +@ExpressionDescription( + usage = "_FUNC_(tile) - Computes per-tile descriptive statistics.", + arguments = """ + Arguments: + * tile - tile column to analyze""", + examples = """ + Examples: + > SELECT _FUNC_(tile); + ...""" +) +case class TileStats(child: Expression) extends UnaryRasterOp + with CodegenFallback { + override def nodeName: String = "tile_stats" + override protected def eval(tile: Tile, ctx: Option[TileContext]): Any = + TileStats.converter(TileStats.op(tile).orNull) + override def dataType: DataType = CellStatistics.schema +} +object TileStats { + def apply(tile: Column): TypedColumn[Any, CellStatistics] = + new Column(TileStats(tile.expr)).as[CellStatistics] + + private lazy val converter = CatalystTypeConverters.createToCatalystConverter(CellStatistics.schema) + + /** Single tile statistics. */ + val op = (t: Tile) ⇒ CellStatistics(t) +} \ No newline at end of file diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/BoundsToGeometry.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/BoundsToGeometry.scala similarity index 94% rename from core/src/main/scala/astraea/spark/rasterframes/expressions/BoundsToGeometry.scala rename to core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/BoundsToGeometry.scala index 0f07549c2..9d6a8c652 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/expressions/BoundsToGeometry.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/BoundsToGeometry.scala @@ -1,7 +1,7 @@ /* * This software is licensed under the Apache 2 license, quoted below. * - * Copyright 2018 Astraea, Inc. + * Copyright 2019 Astraea, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -19,10 +19,11 @@ * */ -package astraea.spark.rasterframes.expressions +package astraea.spark.rasterframes.expressions.transformers import astraea.spark.rasterframes.encoders.CatalystSerializer import astraea.spark.rasterframes.encoders.CatalystSerializer._ +import astraea.spark.rasterframes.expressions.row import com.vividsolutions.jts.geom.{Envelope, Geometry} import geotrellis.vector.Extent import org.apache.spark.sql.catalyst.analysis.TypeCheckResult diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/DebugRender.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/DebugRender.scala new file mode 100644 index 000000000..c26cc6b51 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/DebugRender.scala @@ -0,0 +1,77 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ *     [http://www.apache.org/licenses/LICENSE-2.0]
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ */
+
+package astraea.spark.rasterframes.expressions.transformers
+import astraea.spark.rasterframes.expressions.UnaryRasterOp
+import astraea.spark.rasterframes.model.TileContext
+import astraea.spark.rasterframes.util.TileAsMatrix
+import geotrellis.raster.Tile
+import geotrellis.raster.render.ascii.AsciiArtEncoder
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription}
+import org.apache.spark.sql.types.{DataType, StringType}
+import org.apache.spark.sql.{Column, TypedColumn}
+import org.apache.spark.unsafe.types.UTF8String
+
+abstract class DebugRender(asciiArt: Boolean) extends UnaryRasterOp
+  with CodegenFallback with Serializable {
+  override def dataType: DataType = StringType
+
+  override protected def eval(tile: Tile, ctx: Option[TileContext]): Any = {
+    UTF8String.fromString(if (asciiArt)
+      s"\n${tile.renderAscii(AsciiArtEncoder.Palette.NARROW)}\n"
+    else
+      s"\n${tile.renderMatrix(6)}\n"
+    )
+  }
+}
+
+object DebugRender {
+  import astraea.spark.rasterframes.encoders.StandardEncoders.PrimitiveEncoders.stringEnc
+
+  @ExpressionDescription(
+    usage = "_FUNC_(tile) - Converts the contents of the given tile to an ASCII art string rendering",
+    arguments = """
+  Arguments:
+    * tile - tile to render"""
+  )
+  case class RenderAscii(child: Expression) extends DebugRender(true) {
+    override def nodeName: String = "render_ascii"
+  }
+  object RenderAscii {
+    def apply(tile: Column): TypedColumn[Any, String] =
+      new Column(RenderAscii(tile.expr)).as[String]
+  }
+
+  @ExpressionDescription(
+    usage = "_FUNC_(tile) - Converts the contents of the given tile to a 2-d array of numeric values",
+    arguments = """
+  Arguments:
+    * tile - tile to render"""
+  )
+  case class RenderMatrix(child: Expression) extends DebugRender(false) {
+    override def nodeName: String = "render_matrix"
+  }
+  object RenderMatrix {
+    def apply(tile: Column): TypedColumn[Any, String] =
+      new Column(RenderMatrix(tile.expr)).as[String]
+  }
+}
diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/GeometryToBounds.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/GeometryToBounds.scala
similarity index 96%
rename from core/src/main/scala/astraea/spark/rasterframes/expressions/GeometryToBounds.scala
rename to core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/GeometryToBounds.scala
index bee66a7a9..4e08ad9ea 100644
--- a/core/src/main/scala/astraea/spark/rasterframes/expressions/GeometryToBounds.scala
+++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/GeometryToBounds.scala
@@ -1,7 +1,7 @@
 /*
  * This software is licensed under the Apache 2 license, quoted below.
  *
- * Copyright 2018 Astraea, Inc.
+ * Copyright 2019 Astraea, Inc.
* * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -19,8 +19,7 @@ * */ -package astraea.spark.rasterframes.expressions - +package astraea.spark.rasterframes.expressions.transformers import astraea.spark.rasterframes.encoders.CatalystSerializer import astraea.spark.rasterframes.encoders.CatalystSerializer._ diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/Mask.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/Mask.scala new file mode 100644 index 000000000..03e81efc2 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/Mask.scala @@ -0,0 +1,145 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.expressions.transformers +import astraea.spark.rasterframes.encoders.CatalystSerializer._ +import astraea.spark.rasterframes.expressions.DynamicExtractors._ +import astraea.spark.rasterframes.expressions.row +import com.typesafe.scalalogging.LazyLogging +import geotrellis.raster +import geotrellis.raster.Tile +import geotrellis.raster.mapalgebra.local.{Defined, InverseMask => gtInverseMask, Mask => gtMask} +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription, Literal, TernaryExpression} +import org.apache.spark.sql.rf.TileUDT +import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.{Column, TypedColumn} + +abstract class Mask(val left: Expression, val middle: Expression, val right: Expression, inverse: Boolean) + extends TernaryExpression with CodegenFallback with Serializable with LazyLogging { + + override def children: Seq[Expression] = Seq(left, middle, right) + + override def checkInputDataTypes(): TypeCheckResult = { + if (!tileExtractor.isDefinedAt(left.dataType)) { + TypeCheckFailure(s"Input type '${left.dataType}' does not conform to a raster type.") + } else if (!tileExtractor.isDefinedAt(middle.dataType)) { + TypeCheckFailure(s"Input type '${middle.dataType}' does not conform to a raster type.") + } else if (!intArgExtractor.isDefinedAt(right.dataType)) { + TypeCheckFailure(s"Input type '${right.dataType}' isn't an integral type.") + } else TypeCheckSuccess + } + override def dataType: DataType = left.dataType + + override protected def nullSafeEval(leftInput: Any, middleInput: Any, rightInput: Any): Any = { + implicit val tileSer = TileUDT.tileSerializer + val (leftTile, leftCtx) = tileExtractor(left.dataType)(row(leftInput)) + val (rightTile, rightCtx) = 
tileExtractor(middle.dataType)(row(middleInput))
+
+    if (leftCtx.isEmpty && rightCtx.isDefined)
+      logger.warn(
+        s"Right-hand parameter '${middle}' provided an extent and CRS, but the left-hand parameter " +
+          s"'${left}' didn't have any. Because the left-hand side defines output type, the right-hand context will be lost.")
+
+    if (leftCtx.isDefined && rightCtx.isDefined && leftCtx != rightCtx)
+      logger.warn(s"Both '${left}' and '${middle}' provided an extent and CRS, but they are different. Left-hand side will be used.")
+
+    val maskValue = intArgExtractor(right.dataType)(rightInput)
+
+    // A mask value of 0 means "mask by defined-ness": the mask tile is first converted
+    // to a 0/1 tile via Defined, so cells are masked wherever the mask is NODATA.
+    val masking = if (maskValue.value == 0) Defined(rightTile)
+      else rightTile
+
+    val result = if (inverse)
+      gtInverseMask(leftTile, masking, maskValue.value, raster.NODATA)
+    else
+      gtMask(leftTile, masking, maskValue.value, raster.NODATA)
+
+    leftCtx match {
+      case Some(ctx) => ctx.toProjectRasterTile(result).toInternalRow
+      case None => result.toInternalRow
+    }
+  }
+}
+object Mask {
+  import astraea.spark.rasterframes.encoders.StandardEncoders.singlebandTileEncoder
+
+  @ExpressionDescription(
+    usage = "_FUNC_(target, mask) - Generate a tile with the values from the data tile, but where cells in the masking tile contain NODATA, replace the data value with NODATA.",
+    arguments = """
+  Arguments:
+    * target - tile to mask
+    * mask - masking definition""",
+    examples = """
+  Examples:
+    > SELECT _FUNC_(target, mask);
+       ..."""
+  )
+  case class MaskByDefined(target: Expression, mask: Expression)
+    extends Mask(target, mask, Literal(0), false) {
+    override def nodeName: String = "mask"
+  }
+  object MaskByDefined {
+    def apply(targetTile: Column, maskTile: Column): TypedColumn[Any, Tile] =
+      new Column(MaskByDefined(targetTile.expr, maskTile.expr)).as[Tile]
+  }
+
+  @ExpressionDescription(
+    usage = "_FUNC_(target, mask) - Generate a tile with the values from the data tile, but where cells in the masking tile DO NOT contain NODATA, replace the data value with NODATA.",
+    arguments = """
+  Arguments:
+    * target - tile to mask
+    * mask - masking definition""",
+    examples = """
+  Examples:
+    > SELECT _FUNC_(target, mask);
+       ..."""
+  )
+  case class InverseMaskByDefined(leftTile: Expression, rightTile: Expression)
+    extends Mask(leftTile, rightTile, Literal(0), true) {
+    override def nodeName: String = "inverse_mask"
+  }
+  object InverseMaskByDefined {
+    def apply(srcTile: Column, maskingTile: Column): TypedColumn[Any, Tile] =
+      new Column(InverseMaskByDefined(srcTile.expr, maskingTile.expr)).as[Tile]
+  }
+
+  @ExpressionDescription(
+    usage = "_FUNC_(target, mask, maskValue) - Generate a tile with the values from the data tile, but where cells in the masking tile contain the masking value, replace the data value with NODATA.",
+    arguments = """
+  Arguments:
+    * target - tile to mask
+    * mask - masking definition
+    * maskValue - value in the mask tile indicating which target cells to set to NODATA""",
+    examples = """
+  Examples:
+    > SELECT _FUNC_(target, mask, maskValue);
+       ..."""
+  )
+  case class MaskByValue(leftTile: Expression, rightTile: Expression, maskValue: Expression)
+    extends Mask(leftTile, rightTile, maskValue, false) {
+    override def nodeName: String = "mask_by_value"
+  }
+  object MaskByValue {
+    def apply(srcTile: Column, maskingTile: Column, maskValue: Column): TypedColumn[Any, Tile] =
+      new Column(MaskByValue(srcTile.expr, maskingTile.expr, maskValue.expr)).as[Tile]
+  }
+}
diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/RasterRefToTile.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/RasterRefToTile.scala
similarity index 91%
rename from core/src/main/scala/astraea/spark/rasterframes/expressions/RasterRefToTile.scala rename to core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/RasterRefToTile.scala index a2a9a961b..c3aa3f337 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/expressions/RasterRefToTile.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/RasterRefToTile.scala @@ -1,7 +1,7 @@ /* * This software is licensed under the Apache 2 license, quoted below. * - * Copyright 2018 Astraea, Inc. + * Copyright 2019 Astraea, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -19,18 +19,19 @@ * */ -package astraea.spark.rasterframes.expressions +package astraea.spark.rasterframes.expressions.transformers import astraea.spark.rasterframes.encoders.CatalystSerializer import astraea.spark.rasterframes.encoders.CatalystSerializer._ +import astraea.spark.rasterframes.expressions.row import astraea.spark.rasterframes.ref.RasterRef import com.typesafe.scalalogging.LazyLogging import geotrellis.raster.Tile -import org.apache.spark.sql.{Column, TypedColumn} import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, UnaryExpression} import org.apache.spark.sql.rf._ import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.{Column, TypedColumn} /** * Realizes a RasterRef into a Tile. @@ -47,6 +48,7 @@ case class RasterRefToTile(child: Expression) extends UnaryExpression override def dataType: DataType = new TileUDT override protected def nullSafeEval(input: Any): Any = { + implicit val ser = TileUDT.tileSerializer val ref = row(input).to[RasterRef] (ref.tile: Tile).toInternalRow } diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/RasterSourceToRasterRefs.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/RasterSourceToRasterRefs.scala similarity index 89% rename from core/src/main/scala/astraea/spark/rasterframes/expressions/RasterSourceToRasterRefs.scala rename to core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/RasterSourceToRasterRefs.scala index f754b8401..2581f8be5 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/expressions/RasterSourceToRasterRefs.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/RasterSourceToRasterRefs.scala @@ -1,7 +1,7 @@ /* * This software is licensed under the Apache 2 license, quoted below. * - * Copyright 2018 Astraea, Inc. + * Copyright 2019 Astraea, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -19,19 +19,19 @@ * */ -package astraea.spark.rasterframes.expressions +package astraea.spark.rasterframes.expressions.transformers import astraea.spark.rasterframes.encoders.CatalystSerializer import astraea.spark.rasterframes.encoders.CatalystSerializer._ import astraea.spark.rasterframes.ref.RasterRef import astraea.spark.rasterframes.util._ import com.typesafe.scalalogging.LazyLogging -import org.apache.spark.sql.{Column, TypedColumn} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.rf._ import org.apache.spark.sql.types.{DataType, StructField, StructType} +import org.apache.spark.sql.{Column, TypedColumn} import scala.util.control.NonFatal @@ -44,20 +44,20 @@ import scala.util.control.NonFatal case class RasterSourceToRasterRefs(children: Seq[Expression], applyTiling: Boolean) extends Expression with Generator with CodegenFallback with ExpectsInputTypes with LazyLogging { - private val rasterSourceType = new RasterSourceUDT() + private val RasterSourceType = new RasterSourceUDT() private val rasterRefSchema = CatalystSerializer[RasterRef].schema - override def inputTypes: Seq[DataType] = Seq.fill(children.size)(rasterSourceType) + override def inputTypes: Seq[DataType] = Seq.fill(children.size)(RasterSourceType) override def nodeName: String = "raster_source_to_raster_ref" override def elementSchema: StructType = StructType( - children.map(e ⇒ StructField(e.name, rasterRefSchema, true)) + children.map(e ⇒ StructField(e.name, rasterRefSchema, false)) ) override def eval(input: InternalRow): TraversableOnce[InternalRow] = { try { val refs = children.map { child ⇒ - val src = rasterSourceType.deserialize(child.eval(input)) + val src = RasterSourceType.deserialize(child.eval(input)) if (applyTiling) src.nativeTiling.map(e ⇒ RasterRef(src, Some(e))) else Seq(RasterRef(src)) } refs.transpose.map(ts ⇒ InternalRow(ts.map(_.toInternalRow): _*)) diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/RasterSourceToTiles.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/RasterSourceToTiles.scala similarity index 95% rename from core/src/main/scala/astraea/spark/rasterframes/expressions/RasterSourceToTiles.scala rename to core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/RasterSourceToTiles.scala index 224c70823..2b1caa3ba 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/expressions/RasterSourceToTiles.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/RasterSourceToTiles.scala @@ -1,7 +1,7 @@ /* * This software is licensed under the Apache 2 license, quoted below. * - * Copyright 2018 Astraea, Inc. + * Copyright 2019 Astraea, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -19,7 +19,7 @@ * */ -package astraea.spark.rasterframes.expressions +package astraea.spark.rasterframes.expressions.transformers import astraea.spark.rasterframes.encoders.CatalystSerializer._ import astraea.spark.rasterframes.util._ @@ -53,6 +53,8 @@ case class RasterSourceToTiles(children: Seq[Expression], applyTiling: Boolean) ) override def eval(input: InternalRow): TraversableOnce[InternalRow] = { + implicit val ser = TileUDT.tileSerializer + try { val refs = children.map { child ⇒ val src = RasterSourceType.deserialize(child.eval(input)) diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/ReprojectGeometry.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/ReprojectGeometry.scala similarity index 98% rename from core/src/main/scala/astraea/spark/rasterframes/expressions/ReprojectGeometry.scala rename to core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/ReprojectGeometry.scala index e10cd323a..7e78c5942 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/expressions/ReprojectGeometry.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/ReprojectGeometry.scala @@ -19,11 +19,11 @@ * */ -package astraea.spark.rasterframes.expressions +package astraea.spark.rasterframes.expressions.transformers import astraea.spark.rasterframes._ -import astraea.spark.rasterframes.encoders.{CatalystSerializer, serialized_literal} import astraea.spark.rasterframes.encoders.CatalystSerializer._ +import astraea.spark.rasterframes.encoders.{CatalystSerializer, serialized_literal} import astraea.spark.rasterframes.jts.ReprojectionTransformer import com.vividsolutions.jts.geom.Geometry import geotrellis.proj4.CRS diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/SetCellType.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/SetCellType.scala similarity index 64% rename from core/src/main/scala/astraea/spark/rasterframes/expressions/SetCellType.scala rename to core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/SetCellType.scala index d0bc0d3af..96fcd4288 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/expressions/SetCellType.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/SetCellType.scala @@ -1,7 +1,7 @@ /* * This software is licensed under the Apache 2 license, quoted below. * - * Copyright 2018 Astraea, Inc. + * Copyright 2019 Astraea, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -19,11 +19,13 @@ * */ -package astraea.spark.rasterframes.expressions +package astraea.spark.rasterframes.expressions.transformers import astraea.spark.rasterframes.encoders.CatalystSerializer import astraea.spark.rasterframes.encoders.CatalystSerializer._ import astraea.spark.rasterframes.encoders.StandardEncoders._ +import astraea.spark.rasterframes.expressions.DynamicExtractors.tileExtractor +import astraea.spark.rasterframes.expressions.row import geotrellis.raster.{CellType, Tile} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult @@ -41,50 +43,54 @@ import org.apache.spark.unsafe.types.UTF8String * * @since 9/11/18 */ -case class SetCellType(tile: Expression, cellType: Expression) extends BinaryExpression with CodegenFallback { +case class SetCellType(tile: Expression, cellType: Expression) + extends BinaryExpression with CodegenFallback { def left = tile def right = cellType override def nodeName: String = "set_cell_type" - override def dataType: DataType = new TileUDT() + override def dataType: DataType = left.dataType private val ctSchema = CatalystSerializer[CellType].schema override def checkInputDataTypes(): TypeCheckResult = { - RequiresTile.check(tile) match { - case TypeCheckSuccess ⇒ - right.dataType match { - case StringType ⇒ TypeCheckSuccess - case st: StructType if st == ctSchema ⇒ TypeCheckSuccess - case _ ⇒ TypeCheckFailure( - s"Expected CellType but received '${right.dataType.simpleString}'" - ) - } - case o ⇒ o - } + if (!tileExtractor.isDefinedAt(left.dataType)) + TypeCheckFailure(s"Input type '${left.dataType}' does not conform to a raster type.") + else + right.dataType match { + case StringType => TypeCheckSuccess + case t if t.conformsTo(ctSchema) => TypeCheckSuccess + case _ => + TypeCheckFailure(s"Expected CellType but received '${right.dataType.simpleString}'") + } } private def toCellType(datum: Any): CellType = { right.dataType match { - case StringType ⇒ + case StringType => val text = datum.asInstanceOf[UTF8String].toString CellType.fromName(text) - case st: StructType if st == ctSchema ⇒ + case st if st.conformsTo(ctSchema) => row(datum).to[CellType] } } - override protected def nullSafeEval(left: Any, right: Any): InternalRow = { - val t = row(left).to[Tile] - val ct = toCellType(right) - t.convert(ct).toInternalRow + override protected def nullSafeEval(tileInput: Any, ctInput: Any): InternalRow = { + implicit val tileSer = TileUDT.tileSerializer + + val (tile, ctx) = tileExtractor(left.dataType)(row(tileInput)) + val ct = toCellType(ctInput) + val result = tile.convert(ct) + + ctx match { + case Some(c) => c.toProjectRasterTile(result).toInternalRow + case None => result.toInternalRow + } } } object SetCellType { - def apply(tile: Column, cellType: CellType): TypedColumn[Any, Tile] = new Column(new SetCellType(tile.expr, lit(cellType.name).expr)).as[Tile] def apply(tile: Column, cellType: String): TypedColumn[Any, Tile] = new Column(new SetCellType(tile.expr, lit(cellType).expr)).as[Tile] - } diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/TileToArrayDouble.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/TileToArrayDouble.scala new file mode 100644 index 000000000..02a4bc4e8 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/TileToArrayDouble.scala @@ -0,0 +1,49 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. 
+ *
+ * Copyright 2019 Astraea, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ *     [http://www.apache.org/licenses/LICENSE-2.0]
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ */
+
+package astraea.spark.rasterframes.expressions.transformers
+import astraea.spark.rasterframes.expressions.UnaryRasterOp
+import astraea.spark.rasterframes.model.TileContext
+import geotrellis.raster.Tile
+import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription}
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.catalyst.util.ArrayData
+import org.apache.spark.sql.types.{DataType, DataTypes, DoubleType}
+import org.apache.spark.sql.{Column, TypedColumn}
+
+@ExpressionDescription(
+  usage = "_FUNC_(tile) - Converts the contents of the given tile to an array of double floating-point values",
+  arguments = """
+  Arguments:
+    * tile - tile to convert"""
+)
+case class TileToArrayDouble(child: Expression) extends UnaryRasterOp with CodegenFallback {
+  override def nodeName: String = "tile_to_array_double"
+  override def dataType: DataType = DataTypes.createArrayType(DoubleType, false)
+  override protected def eval(tile: Tile, ctx: Option[TileContext]): Any = {
+    ArrayData.toArrayData(tile.toArrayDouble())
+  }
+}
+object TileToArrayDouble {
+  import astraea.spark.rasterframes.encoders.StandardEncoders.PrimitiveEncoders.arrayEnc
+  def apply(tile: Column): TypedColumn[Any, Array[Double]] =
+    new Column(TileToArrayDouble(tile.expr)).as[Array[Double]]
+}
diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/TileToArrayInt.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/TileToArrayInt.scala
new file mode 100644
index 000000000..31ad81516
--- /dev/null
+++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/TileToArrayInt.scala
@@ -0,0 +1,50 @@
+/*
+ * This software is licensed under the Apache 2 license, quoted below.
+ *
+ * Copyright 2019 Astraea, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ *     [http://www.apache.org/licenses/LICENSE-2.0]
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ */
+
+package astraea.spark.rasterframes.expressions.transformers
+
+import astraea.spark.rasterframes.expressions.UnaryRasterOp
+import astraea.spark.rasterframes.model.TileContext
+import geotrellis.raster.Tile
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription}
+import org.apache.spark.sql.catalyst.util.ArrayData
+import org.apache.spark.sql.types.{DataType, DataTypes, IntegerType}
+import org.apache.spark.sql.{Column, TypedColumn}
+
+@ExpressionDescription(
+  usage = "_FUNC_(tile) - Converts the contents of the given tile to an array of integer values",
+  arguments = """
+  Arguments:
+    * tile - tile to convert"""
+)
+case class TileToArrayInt(child: Expression) extends UnaryRasterOp with CodegenFallback {
+  override def nodeName: String = "tile_to_array_int"
+  override def dataType: DataType = DataTypes.createArrayType(IntegerType, false)
+  override protected def eval(tile: Tile, ctx: Option[TileContext]): Any = {
+    ArrayData.toArrayData(tile.toArray())
+  }
+}
+object TileToArrayInt {
+  import astraea.spark.rasterframes.encoders.StandardEncoders.PrimitiveEncoders.arrayEnc
+  def apply(tile: Column): TypedColumn[Any, Array[Int]] =
+    new Column(TileToArrayInt(tile.expr)).as[Array[Int]]
+}
diff --git a/core/src/main/scala/astraea/spark/rasterframes/expressions/URIToRasterSource.scala b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/URIToRasterSource.scala
similarity index 93%
rename from core/src/main/scala/astraea/spark/rasterframes/expressions/URIToRasterSource.scala
rename to core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/URIToRasterSource.scala
index a857324eb..0821e43db 100644
--- a/core/src/main/scala/astraea/spark/rasterframes/expressions/URIToRasterSource.scala
+++ b/core/src/main/scala/astraea/spark/rasterframes/expressions/transformers/URIToRasterSource.scala
@@ -1,7 +1,7 @@
 /*
  * This software is licensed under the Apache 2 license, quoted below.
  *
- * Copyright 2018 Astraea, Inc.
+ * Copyright 2019 Astraea, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License.
You may obtain a copy of @@ -19,11 +19,10 @@ * */ -package astraea.spark.rasterframes.expressions +package astraea.spark.rasterframes.expressions.transformers import java.net.URI -import astraea.spark.rasterframes.encoders.CatalystSerializer._ import astraea.spark.rasterframes.ref.RasterSource.ReadCallback import astraea.spark.rasterframes.ref.{RasterRef, RasterSource} import com.typesafe.scalalogging.LazyLogging @@ -54,7 +53,7 @@ case class URIToRasterSource(override val child: Expression, accumulator: Option val uriString = input.asInstanceOf[UTF8String].toString val uri = URI.create(uriString) val ref = RasterSource(uri, accumulator) - ref.toInternalRow + RasterSourceUDT.serialize(ref) } } diff --git a/core/src/main/scala/astraea/spark/rasterframes/extensions/DataFrameMethods.scala b/core/src/main/scala/astraea/spark/rasterframes/extensions/DataFrameMethods.scala index 8f8080c13..ca38322ac 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/extensions/DataFrameMethods.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/extensions/DataFrameMethods.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.rf.TileUDT import org.apache.spark.sql.types.{MetadataBuilder, StructField} import org.apache.spark.sql.{Column, DataFrame, TypedColumn} import spray.json.JsonFormat - +import astraea.spark.rasterframes.encoders.StandardEncoders._ import scala.util.Try /** diff --git a/core/src/main/scala/astraea/spark/rasterframes/extensions/RasterFrameMethods.scala b/core/src/main/scala/astraea/spark/rasterframes/extensions/RasterFrameMethods.scala index fd658b596..e83e55fd3 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/extensions/RasterFrameMethods.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/extensions/RasterFrameMethods.scala @@ -33,7 +33,8 @@ import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{Metadata, TimestampType} import spray.json._ - +import astraea.spark.rasterframes.encoders.StandardEncoders._ +import astraea.spark.rasterframes.encoders.StandardEncoders.PrimitiveEncoders._ import scala.reflect.runtime.universe._ /** @@ -44,10 +45,6 @@ trait RasterFrameMethods extends MethodExtensions[RasterFrame] with RFSpatialColumnMethods with MetadataKeys with LazyLogging { import Implicits.{WithDataFrameMethods, WithRasterFrameMethods} - private val _stableDF = self - import _stableDF.sqlContext.implicits._ - - /** * A convenience over `DataFrame.withColumnRenamed` whereby the `RasterFrame` type is maintained. 
*/ @@ -190,6 +187,7 @@ trait RasterFrameMethods extends MethodExtensions[RasterFrame] def updateBounds[T: SpatialComponent: Boundable: JsonFormat: TypeTag](tlm: TileLayerMetadata[T], keys: Dataset[T]): DataFrame = { + implicit val enc = Encoders.product[KeyBounds[T]] val keyBounds = keys .map(k ⇒ KeyBounds(k, k)) .reduce(_ combine _) @@ -238,6 +236,7 @@ trait RasterFrameMethods extends MethodExtensions[RasterFrame] def toMultibandTileLayerRDD(tileCols: Column*): Either[MultibandTileLayerRDD[SpatialKey], MultibandTileLayerRDD[SpaceTimeKey]] = tileLayerMetadata.fold( tlm ⇒ { + implicit val genEnc = expressionEncoder[(SpatialKey, Array[Tile])] val rdd = self .select(self.spatialKeyColumn, array(tileCols: _*)).as[(SpatialKey, Array[Tile])] .rdd @@ -247,6 +246,7 @@ trait RasterFrameMethods extends MethodExtensions[RasterFrame] Left(ContextRDD(rdd, tlm)) }, tlm ⇒ { + implicit val genEnc = expressionEncoder[(SpatialKey, TemporalKey, Array[Tile])] val rdd = self .select(self.spatialKeyColumn, self.temporalKeyColumn.get, array(tileCols: _*)).as[(SpatialKey, TemporalKey, Array[Tile])] .rdd @@ -259,48 +259,6 @@ trait RasterFrameMethods extends MethodExtensions[RasterFrame] private[rasterframes] def extract[M: JsonFormat](metadataKey: String)(md: Metadata) = md.getMetadata(metadataKey).json.parseJson.convertTo[M] - // TODO: Take care of DRY below -// private def rasterize[T <: CellGrid: TypeTag]( -// tileCols: Seq[Column], -// rasterCols: Int, -// rasterRows: Int, -// resampler: ResampleMethod): ProjectedRaster[T] = { -// -// val clipped = clipLayerExtent -// -// val md = clipped.tileLayerMetadata.widen -// val newLayout = LayoutDefinition(md.extent, TileLayout(1, 1, rasterCols, rasterRows)) -// -// val trans = md.mapTransform -// -// //val cell_type = rdd.first()._2.cell_type -// val keyBounds = Bounds(SpatialKey(0, 0), SpatialKey(0, 0)) -// val newLayerMetadata = -// md.copy(layout = newLayout, bounds = keyBounds) -// -// -// val newLayer = typeOf[T] match { -// case tpe if tpe <:< typeOf[Tile] ⇒ -// val r = clipped.toTileLayerRDD(tileCols.head) -// .fold(identity, _.map { case (stk, t) ⇒ (stk.spatialKey, t) }) // <-- Drops the temporal key outright -// .map { case (key, tile) ⇒ (ProjectedExtent(trans(key), md.crs), tile) } -// ContextRDD(r, md) -// .tileToLayout(newLayerMetadata, Tiler.Options(resampler)) -// case tpe if tpe <:< typeOf[MultibandTile] ⇒ -// val r = clipped.toMultibandTileLayerRDD(tileCols: _*) -// .fold(identity, _.map { case (stk, t) ⇒ (stk.spatialKey, t) }) // <-- Drops the temporal key outright -// .map { case (key, tile) ⇒ (ProjectedExtent(trans(key), md.crs), tile) } -// ContextRDD(r, md) -// .tileToLayout(newLayerMetadata, Tiler.Options(resampler)) -// } -// -// val stitchedTile = newLayer.stitch() -// -// val croppedTile = stitchedTile.crop(rasterCols, rasterRows) -// -// ProjectedRaster(croppedTile, md.extent, md.crs) -// } - /** Convert the tiles in the RasterFrame into a single raster. For RasterFrames keyed with temporal keys, they * will be merged non-deterministically. */ def toRaster(tileCol: Column, diff --git a/core/src/main/scala/astraea/spark/rasterframes/functions/CellCountAggregate.scala b/core/src/main/scala/astraea/spark/rasterframes/functions/CellCountAggregate.scala deleted file mode 100644 index 856b367c6..000000000 --- a/core/src/main/scala/astraea/spark/rasterframes/functions/CellCountAggregate.scala +++ /dev/null @@ -1,81 +0,0 @@ -/* - * This software is licensed under the Apache 2 license, quoted below. - * - * Copyright 2017 Astraea, Inc.
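The `Encoders.product` line added to `updateBounds` supplies the implicit encoder that `Dataset.map`/`reduce` over `KeyBounds[T]` requires. The pattern is reproducible in isolation; a sketch with a made-up case class (`Coord` is illustrative only, not part of the codebase):

    // Standalone illustration of the Encoders.product pattern used above.
    import org.apache.spark.sql.{Encoders, SparkSession}

    case class Coord(col: Int, row: Int)

    def distinctCoords(spark: SparkSession): Long = {
      implicit val enc = Encoders.product[Coord] // same move as the KeyBounds encoder
      spark.range(100)
        .map(i => Coord((i % 10).toInt, (i / 10).toInt))(enc)
        .distinct()
        .count()
    }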
- * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * [http://www.apache.org/licenses/LICENSE-2.0] - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - * - */ - -package astraea.spark.rasterframes.functions - -import org.apache.spark.sql.{Column, TypedColumn} -import org.apache.spark.sql.catalyst.dsl.expressions._ -import org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, _} -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.rf.{TileUDT, _} -import org.apache.spark.sql.types.{LongType, Metadata} - -/** - * Cell count (data or NoData) aggregate function. - * - * @since 10/5/17 - * @param isData true if count should be of non-NoData cells, false if count should be of NoData cells. - */ -case class CellCountAggregate(isData: Boolean, child: Expression) extends DeclarativeAggregate { - - override def prettyName: String = - if (isData) "agg_data_cells" - else "agg_no_data_cells" - - private lazy val count = - AttributeReference("count", LongType, false, Metadata.empty)() - - override lazy val aggBufferAttributes = count :: Nil - - val initialValues = Seq( - Literal(0L) - ) - - private val cellTest = - if (isData) udf(dataCells) - else udf(noDataCells) - - val updateExpressions = Seq( - If(IsNull(child), count, Add(count, cellTest(new Column(child)).expr)) - ) - - val mergeExpressions = Seq( - count.left + count.right - ) - - val evaluateExpression = count - - def inputTypes = Seq(TileUDT) - - def nullable = true - - def dataType = LongType - - def children = Seq(child) -} - -object CellCountAggregate { - import astraea.spark.rasterframes.encoders.SparkDefaultEncoders._ - def apply(isData: Boolean, tile: Column): TypedColumn[Any, Long] = - new Column(new CellCountAggregate(isData, tile.expr).toAggregateExpression()).as[Long] -} - - - diff --git a/core/src/main/scala/astraea/spark/rasterframes/functions/CellStatsAggregate.scala b/core/src/main/scala/astraea/spark/rasterframes/functions/CellStatsAggregate.scala deleted file mode 100644 index f45e7e0cb..000000000 --- a/core/src/main/scala/astraea/spark/rasterframes/functions/CellStatsAggregate.scala +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright 2017 Astraea, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package astraea.spark.rasterframes.functions - -import geotrellis.raster.{Tile, _} -import org.apache.spark.sql.Row -import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} -import org.apache.spark.sql.rf.TileUDT -import org.apache.spark.sql.types.{DataType, _} - -/** - * Statistics aggregation function for a full column of tiles. - * - * @since 4/17/17 - */ -case class CellStatsAggregate() extends UserDefinedAggregateFunction { - import CellStatsAggregate.C - - private val TileType = new TileUDT() - - override def inputSchema: StructType = StructType(StructField("value", TileType) :: Nil) - - override def dataType: DataType = - StructType( - Seq( - StructField("dataCells", LongType), - StructField("noDataCells", LongType), - StructField("min", DoubleType), - StructField("max", DoubleType), - StructField("mean", DoubleType), - StructField("variance", DoubleType) - ) - ) - - override def bufferSchema: StructType = - StructType( - Seq( - StructField("dataCells", LongType), - StructField("noDataCells", LongType), - StructField("min", DoubleType), - StructField("max", DoubleType), - StructField("sum", DoubleType), - StructField("sumSqr", DoubleType) - ) - ) - - override def deterministic: Boolean = true - - override def initialize(buffer: MutableAggregationBuffer): Unit = { - buffer(C.COUNT) = 0L - buffer(C.NODATA) = 0L - buffer(C.MIN) = Double.MaxValue - buffer(C.MAX) = Double.MinValue - buffer(C.SUM) = 0.0 - buffer(C.SUM_SQRS) = 0.0 - } - - override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { - if(!input.isNullAt(0)) { - val tile = input.getAs[Tile](0) - var count = buffer.getLong(C.COUNT) - var nodata = buffer.getLong(C.NODATA) - var min = buffer.getDouble(C.MIN) - var max = buffer.getDouble(C.MAX) - var sum = buffer.getDouble(C.SUM) - var sumSqr = buffer.getDouble(C.SUM_SQRS) - - tile.foreachDouble(c ⇒ if (isData(c)) { - count += 1 - min = math.min(min, c) - max = math.max(max, c) - sum = sum + c - sumSqr = sumSqr + c * c - } - else nodata += 1 - ) - - buffer(C.COUNT) = count - buffer(C.NODATA) = nodata - buffer(C.MIN) = min - buffer(C.MAX) = max - buffer(C.SUM) = sum - buffer(C.SUM_SQRS) = sumSqr - } - } - - override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { - buffer1(C.COUNT) = buffer1.getLong(C.COUNT) + buffer2.getLong(C.COUNT) - buffer1(C.NODATA) = buffer1.getLong(C.NODATA) + buffer2.getLong(C.NODATA) - buffer1(C.MIN) = math.min(buffer1.getDouble(C.MIN), buffer2.getDouble(C.MIN)) - buffer1(C.MAX) = math.max(buffer1.getDouble(C.MAX), buffer2.getDouble(C.MAX)) - buffer1(C.SUM) = buffer1.getDouble(C.SUM) + buffer2.getDouble(C.SUM) - buffer1(C.SUM_SQRS) = buffer1.getDouble(C.SUM_SQRS) + buffer2.getDouble(C.SUM_SQRS) - } - - override def evaluate(buffer: Row): Any = { - val count = buffer.getLong(C.COUNT) - val sum = buffer.getDouble(C.SUM) - val sumSqr = buffer.getDouble(C.SUM_SQRS) - val mean = sum / count - val variance = sumSqr / count - mean * mean - Row(count, buffer(C.NODATA), buffer(C.MIN), buffer(C.MAX), mean, variance) - } -} - -object CellStatsAggregate { - /** Column index values. 
*/ - private object C { - final val COUNT = 0 - final val NODATA = 1 - final val MIN = 2 - final val MAX = 3 - final val SUM = 4 - final val SUM_SQRS = 5 - } -} diff --git a/core/src/main/scala/astraea/spark/rasterframes/functions/HistogramAggregate.scala b/core/src/main/scala/astraea/spark/rasterframes/functions/HistogramAggregate.scala deleted file mode 100644 index 972ae2b73..000000000 --- a/core/src/main/scala/astraea/spark/rasterframes/functions/HistogramAggregate.scala +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright 2017 Astraea, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package astraea.spark.rasterframes.functions - -import java.nio.ByteBuffer - -import astraea.spark.rasterframes.encoders.StandardEncoders -import astraea.spark.rasterframes.stats.CellHistogram -import geotrellis.raster.Tile -import geotrellis.raster.histogram.{Histogram, StreamingHistogram} -import geotrellis.spark.util.KryoSerializer -import org.apache.spark.sql.Row -import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} -import org.apache.spark.sql.rf.TileUDT -import org.apache.spark.sql.types._ - -/** - * Histogram aggregation function for a full column of tiles. - * - * @since 4/24/17 - */ -case class HistogramAggregate(numBuckets: Int) extends UserDefinedAggregateFunction { - def this() = this(StreamingHistogram.DEFAULT_NUM_BUCKETS) - - private val TileType = new TileUDT() - - override def inputSchema: StructType = StructType(StructField("value", TileType) :: Nil) - - override def bufferSchema: StructType = StructType(StructField("buffer", BinaryType) :: Nil) - - override def dataType: DataType = StandardEncoders.histEncoder.schema - - override def deterministic: Boolean = true - - @transient - private lazy val ser = KryoSerializer.ser.newInstance() - - @inline - private def marshall(hist: Histogram[Double]): Array[Byte] = ser.serialize(hist).array() - - @inline - private def unmarshall(blob: Array[Byte]): Histogram[Double] = ser.deserialize(ByteBuffer.wrap(blob)) - - override def initialize(buffer: MutableAggregationBuffer): Unit = - buffer(0) = marshall(StreamingHistogram(numBuckets)) - - private val safeMerge = safeEval((h1: Histogram[Double], h2: Histogram[Double]) ⇒ h1 merge h2) - - override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { - val tile = input.getAs[Tile](0) - val hist1 = unmarshall(buffer.getAs[Array[Byte]](0)) - val hist2 = safeEval(StreamingHistogram.fromTile(_: Tile, numBuckets))(tile) - val updatedHist = safeMerge(hist1, hist2) - buffer(0) = marshall(updatedHist) - } - - override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { - val hist1 = unmarshall(buffer1.getAs[Array[Byte]](0)) - val hist2 = unmarshall(buffer2.getAs[Array[Byte]](0)) - val updatedHist = safeMerge(hist1, hist2) - buffer1(0) = marshall(updatedHist) - } - - override def evaluate(buffer: Row): Any = { - val hist = unmarshall(buffer.getAs[Array[Byte]](0)) - CellHistogram(hist) - } -} - -object HistogramAggregate { - 
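The stats aggregate removed here (reintroduced under the `expressions.aggstats` package, judging by the import added to `functions/package.scala` further down) derives its final answer from three running sums, which is why partial buffers merge by plain addition. A standalone sketch of that buffer algebra (`StatsBuffer` is illustrative, not a class in the codebase):

    // Illustrative model of the removed aggregation buffer: count/sum/sumSqr
    // combine additively, and evaluate() recovers the moments at the end.
    case class StatsBuffer(count: Long, sum: Double, sumSqr: Double) {
      def add(x: Double): StatsBuffer = StatsBuffer(count + 1, sum + x, sumSqr + x * x)
      def merge(that: StatsBuffer): StatsBuffer =
        StatsBuffer(count + that.count, sum + that.sum, sumSqr + that.sumSqr)
      def mean: Double = sum / count
      def variance: Double = sumSqr / count - mean * mean // E[x^2] - E[x]^2
    }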
def apply() = new HistogramAggregate(StreamingHistogram.DEFAULT_NUM_BUCKETS) -} diff --git a/core/src/main/scala/astraea/spark/rasterframes/functions/LocalCountAggregate.scala b/core/src/main/scala/astraea/spark/rasterframes/functions/LocalCountAggregate.scala deleted file mode 100644 index 4f06b69c1..000000000 --- a/core/src/main/scala/astraea/spark/rasterframes/functions/LocalCountAggregate.scala +++ /dev/null @@ -1,56 +0,0 @@ -package astraea.spark.rasterframes.functions - -import geotrellis.raster.mapalgebra.local.{Add, Defined, Undefined} -import geotrellis.raster.{IntConstantNoDataCellType, Tile} -import org.apache.spark.sql.Row -import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} -import org.apache.spark.sql.rf.TileUDT -import org.apache.spark.sql.types.{DataType, StructField, StructType} - -/** - * Catalyst aggregate function that counts `NoData` values in a cell-wise fashion. - * - * @param isData true if count should be of non-NoData values, false for NoData values. - * @since 8/11/17 - */ -class LocalCountAggregate(isData: Boolean) extends UserDefinedAggregateFunction { - - private val incCount = - if (isData) safeBinaryOp((t1: Tile, t2: Tile) ⇒ Add(t1, Defined(t2))) - else safeBinaryOp((t1: Tile, t2: Tile) ⇒ Add(t1, Undefined(t2))) - - private val add = safeBinaryOp(Add.apply(_: Tile, _: Tile)) - - private val TileType = new TileUDT() - - override def dataType: DataType = TileType - - override def inputSchema: StructType = StructType(StructField("value", TileType) :: Nil) - - override def bufferSchema: StructType = inputSchema - - override def deterministic: Boolean = true - - override def initialize(buffer: MutableAggregationBuffer): Unit = - buffer(0) = null - - override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { - val right = input.getAs[Tile](0) - if (right != null) { - if (buffer(0) == null) { - buffer(0) = ( - if (isData) Defined(right) else Undefined(right) - ).convert(IntConstantNoDataCellType) - } else { - val left = buffer.getAs[Tile](0) - buffer(0) = incCount(left, right) - } - } - } - - override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { - buffer1(0) = add(buffer1.getAs[Tile](0), buffer2.getAs[Tile](0)) - } - - override def evaluate(buffer: Row): Tile = buffer.getAs[Tile](0) -} diff --git a/core/src/main/scala/astraea/spark/rasterframes/functions/LocalMeanAggregate.scala b/core/src/main/scala/astraea/spark/rasterframes/functions/LocalMeanAggregate.scala deleted file mode 100644 index 60d51457c..000000000 --- a/core/src/main/scala/astraea/spark/rasterframes/functions/LocalMeanAggregate.scala +++ /dev/null @@ -1,20 +0,0 @@ -package astraea.spark.rasterframes.functions - -import org.apache.spark.sql.Row -import org.apache.spark.sql.rf.TileUDT -import org.apache.spark.sql.types.DataType - -/** - * Aggregation function that only returns the average. Depends on - * [[LocalStatsAggregate]] for computation and just - * selects the mean result tile. 
- * - * @since 8/11/17 - */ -class LocalMeanAggregate extends LocalStatsAggregate { - override def dataType: DataType = new TileUDT() - override def evaluate(buffer: Row): Any = { - val superRow = super.evaluate(buffer).asInstanceOf[Row] - if (superRow != null) superRow.get(3) else null - } -} diff --git a/core/src/main/scala/astraea/spark/rasterframes/functions/LocalTileOpAggregate.scala b/core/src/main/scala/astraea/spark/rasterframes/functions/LocalTileOpAggregate.scala deleted file mode 100644 index 8ae5eadad..000000000 --- a/core/src/main/scala/astraea/spark/rasterframes/functions/LocalTileOpAggregate.scala +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright 2017 Astraea, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package astraea.spark.rasterframes.functions - -import geotrellis.raster.Tile -import geotrellis.raster.mapalgebra.local.LocalTileBinaryOp -import org.apache.spark.sql.Row -import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} -import org.apache.spark.sql.rf.TileUDT -import org.apache.spark.sql.types._ - -/** - * Aggregation function for applying a [[LocalTileBinaryOp]] pairwise across all tiles. Assumes Monoid algebra. - * - * @since 4/17/17 - */ -class LocalTileOpAggregate(op: LocalTileBinaryOp) extends UserDefinedAggregateFunction { - - private val safeOp = safeBinaryOp(op.apply(_: Tile, _: Tile)) - - private val TileType = new TileUDT() - - override def inputSchema: StructType = StructType(StructField("value", TileType) :: Nil) - - override def bufferSchema: StructType = inputSchema - - override def dataType: DataType = TileType - - override def deterministic: Boolean = true - - override def initialize(buffer: MutableAggregationBuffer): Unit = - buffer(0) = null - - override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { - if (buffer(0) == null) { - buffer(0) = input(0) - } else { - val t1 = buffer.getAs[Tile](0) - val t2 = input.getAs[Tile](0) - buffer(0) = safeOp(t1, t2) - } - } - - override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = update(buffer1, buffer2) - - override def evaluate(buffer: Row): Tile = buffer.getAs[Tile](0) -} diff --git a/core/src/main/scala/astraea/spark/rasterframes/functions/package.scala b/core/src/main/scala/astraea/spark/rasterframes/functions/package.scala index e33570d64..060b08fa3 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/functions/package.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/functions/package.scala @@ -15,19 +15,15 @@ */ package astraea.spark.rasterframes +import astraea.spark.rasterframes.expressions.aggstats._ import astraea.spark.rasterframes.jts.ReprojectionTransformer -import astraea.spark.rasterframes.stats.{CellHistogram, CellStatistics} import astraea.spark.rasterframes.util.CRSParser import com.vividsolutions.jts.geom.Geometry -import geotrellis.proj4.CRS import geotrellis.raster.mapalgebra.local._ -import geotrellis.raster.render.ascii.AsciiArtEncoder import geotrellis.raster.{Tile, 
_} import geotrellis.vector.Extent import org.apache.spark.sql.SQLContext -import scala.reflect.runtime.universe._ - /** * Module utils. * @@ -55,80 +51,6 @@ package object functions { private[rasterframes] def safeEval[P1, P2, R](f: (P1, P2) ⇒ R): (P1, P2) ⇒ R = (p1, p2) ⇒ if (p1 == null || p2 == null) null.asInstanceOf[R] else f(p1, p2) - - /** Count tile cells that have a data value. */ - private[rasterframes] val dataCells: (Tile) ⇒ Long = safeEval((t: Tile) ⇒ { - var count: Long = 0 - t.dualForeach( - z ⇒ if(isData(z)) count = count + 1 - ) ( - z ⇒ if(isData(z)) count = count + 1 - ) - count - }) - - /** Count tile cells that have a no-data value. */ - private[rasterframes] val noDataCells: (Tile) ⇒ Long = safeEval((t: Tile) ⇒ { - var count: Long = 0 - t.dualForeach( - z ⇒ if(isNoData(z)) count = count + 1 - )( - z ⇒ if(isNoData(z)) count = count + 1 - ) - count - }) - - private[rasterframes] val isNoDataTile: (Tile) ⇒ Boolean = (t: Tile) ⇒ { - if(t == null) true - else t.isNoDataTile - } - - /** Flattens tile into an array. */ - private[rasterframes] def tileToArray[T: HasCellType: TypeTag]: (Tile) ⇒ Array[T] = { - def convert(tile: Tile) = { - typeOf[T] match { - case t if t =:= typeOf[Int] ⇒ tile.toArray() - case t if t =:= typeOf[Double] ⇒ tile.toArrayDouble() - case t if t =:= typeOf[Byte] ⇒ tile.toArray().map(_.toByte) // TODO: Check NoData handling. probably need to use dualForeach - case t if t =:= typeOf[Short] ⇒ tile.toArray().map(_.toShort) - case t if t =:= typeOf[Float] ⇒ tile.toArrayDouble().map(_.toFloat) - } - } - - safeEval[Tile, Array[T]] { t ⇒ - val tile = t match { - case c: ConstantTile ⇒ c.toArrayTile() - case o ⇒ o - } - val asArray: Array[_] = tile match { - case t: IntArrayTile ⇒ - if (typeOf[T] =:= typeOf[Int]) t.array - else convert(t) - case t: DoubleArrayTile ⇒ - if (typeOf[T] =:= typeOf[Double]) t.array - else convert(t) - case t: ByteArrayTile ⇒ - if (typeOf[T] =:= typeOf[Byte]) t.array - else convert(t) - case t: UByteArrayTile ⇒ - if (typeOf[T] =:= typeOf[Byte]) t.array - else convert(t) - case t: ShortArrayTile ⇒ - if (typeOf[T] =:= typeOf[Short]) t.array - else convert(t) - case t: UShortArrayTile ⇒ - if (typeOf[T] =:= typeOf[Short]) t.array - else convert(t) - case t: FloatArrayTile ⇒ - if (typeOf[T] =:= typeOf[Float]) t.array - else convert(t) - case _: Tile ⇒ - throw new IllegalArgumentException("Unsupported tile type: " + tile.getClass) - } - asArray.asInstanceOf[Array[T]] - } - } - /** Converts an array into a tile. */ private[rasterframes] def arrayToTile(cols: Int, rows: Int) = { safeEval[AnyRef, Tile]{ @@ -144,143 +66,9 @@ package object functions { } } - /** Computes the column aggregate histogram */ - private[rasterframes] val aggHistogram = HistogramAggregate() - - /** Computes the column aggregate statistics */ - private[rasterframes] val aggStats = CellStatsAggregate() - /** Set the tile's no-data value. */ private[rasterframes] def withNoData(nodata: Double) = safeEval[Tile, Tile](_.withNoData(Some(nodata))) - /** Single tile histogram. */ - private[rasterframes] val tileHistogram = safeEval[Tile, CellHistogram](t ⇒ CellHistogram(t.histogramDouble)) - - /** Single tile statistics. Convenience for `tile_histogram.statistics`. */ - private[rasterframes] val tileStats = safeEval[Tile, CellStatistics]((t: Tile) ⇒ - if (t.cellType.isFloatingPoint) t.statisticsDouble.map(CellStatistics.apply).orNull - else t.statistics.map(CellStatistics.apply).orNull - ) - - /** Add up all the cell values. 
*/ - private[rasterframes] val tileSum: (Tile) ⇒ Double = safeEval((t: Tile) ⇒ { - var sum: Double = 0.0 - t.foreachDouble(z ⇒ if(isData(z)) sum = sum + z) - sum - }) - - /** Find the minimum cell value. */ - private[rasterframes] val tileMin: (Tile) ⇒ Double = safeEval((t: Tile) ⇒ { - var min: Double = Double.MaxValue - t.foreachDouble(z ⇒ if(isData(z)) min = math.min(min, z)) - if (min == Double.MaxValue) Double.NaN - else min - }) - - /** Find the maximum cell value. */ - private[rasterframes] val tileMax: (Tile) ⇒ Double = safeEval((t: Tile) ⇒ { - var max: Double = Double.MinValue - t.foreachDouble(z ⇒ if(isData(z)) max = math.max(max, z)) - if (max == Double.MinValue) Double.NaN - else max - }) - - /** Single tile mean. Convenience for `tile_histogram.statistics.mean`. */ - private[rasterframes] val tileMean: (Tile) ⇒ Double = safeEval((t: Tile) ⇒ { - var sum: Double = 0.0 - var count: Long = 0 - t.dualForeach( - z ⇒ if(isData(z)) { count = count + 1; sum = sum + z } - ) ( - z ⇒ if(isData(z)) { count = count + 1; sum = sum + z } - ) - sum/count - }) - - /** Compute summary cell-wise statistics across tiles. */ - private[rasterframes] val localAggStats = new LocalStatsAggregate() - - /** Compute the cell-wise max across tiles. */ - private[rasterframes] val localAggMax = new LocalTileOpAggregate(Max) - - /** Compute the cell-wise min across tiles. */ - private[rasterframes] val localAggMin = new LocalTileOpAggregate(Min) - - /** Compute the cell-wise main across tiles. */ - private[rasterframes] val localAggMean = new LocalMeanAggregate() - - /** Compute the cell-wise count of non-NA across tiles. */ - private[rasterframes] val localAggCount = new LocalCountAggregate(true) - - /** Compute the cell-wise count of non-NA across tiles. */ - private[rasterframes] val localAggNodataCount = new LocalCountAggregate(false) - - /** Convert the tile to a floating point type as needed for scalar operations. */ - @inline - private def floatingPointTile(t: Tile) = if (t.cellType.isFloatingPoint) t else t.convert(DoubleConstantNoDataCellType) - - /** Cell-wise addition between tiles. */ - private[rasterframes] val localAdd: (Tile, Tile) ⇒ Tile = safeEval(Add.apply) - - /** Cell-wise addition of a scalar to a tile. */ - private[rasterframes] val localAddScalarInt: (Tile, Int) ⇒ Tile = safeEval((t: Tile, scalar:Int) => { - t.localAdd(scalar) - }) - - /** Cell-wise addition of a scalar to a tile. */ - private[rasterframes] val localAddScalar: (Tile, Double) ⇒ Tile = safeEval((t: Tile, scalar:Double) => { - floatingPointTile(t).localAdd(scalar) - }) - - /** Cell-wise subtraction between tiles. */ - private[rasterframes] val localSubtract: (Tile, Tile) ⇒ Tile = safeEval(Subtract.apply) - - /** Cell-wise subtraction of a scalar from a tile. */ - private[rasterframes] val localSubtractScalarInt: (Tile, Int) ⇒ Tile = safeEval((t: Tile, scalar:Int) => { - t.localSubtract(scalar) - }) - - /** Cell-wise subtraction of a scalar from a tile. */ - private[rasterframes] val localSubtractScalar: (Tile, Double) ⇒ Tile = safeEval((t: Tile, scalar:Double) => { - floatingPointTile(t).localSubtract(scalar) - }) - - /** Cell-wise multiplication between tiles. */ - private[rasterframes] val localMultiply: (Tile, Tile) ⇒ Tile = safeEval(Multiply.apply) - - /** Cell-wise multiplication of a tile by a scalar. */ - private[rasterframes] val localMultiplyScalarInt: (Tile, Int) ⇒ Tile = safeEval((t: Tile, scalar:Int) => { - t.localMultiply(scalar) - }) - - /** Cell-wise multiplication of a tile by a scalar. 
*/ - private[rasterframes] val localMultiplyScalar: (Tile, Double) ⇒ Tile = safeEval((t: Tile, scalar:Double) => { - floatingPointTile(t).localMultiply(scalar) - }) - - /** Cell-wise division between tiles. */ - private[rasterframes] val localDivide: (Tile, Tile) ⇒ Tile = safeEval(Divide.apply) - - /** Cell-wise division of a tile by a scalar. */ - private[rasterframes] val localDivideScalarInt: (Tile, Int) ⇒ Tile = safeEval((t: Tile, scalar:Int) => { - t.localDivide(scalar) - }) - - /** Cell-wise division of a tile by a scalar. */ - private[rasterframes] val localDivideScalar: (Tile, Double) ⇒ Tile = safeEval((t: Tile, scalar:Double) => { - floatingPointTile(t).localDivide(scalar) - }) - - /** Cell-wise normalized difference of tiles. */ - private[rasterframes] val normalizedDifference: (Tile, Tile) ⇒ Tile = safeEval((t1: Tile, t2:Tile) => { - val diff = floatingPointTile(Subtract(t1, t2)) - val sum = floatingPointTile(Add(t1, t2)) - Divide(diff, sum) - }) - - /** Render tile as ASCII string. */ - private[rasterframes] val renderAscii: (Tile) ⇒ String = safeEval(_.renderAscii(AsciiArtEncoder.Palette.NARROW)) - /** Constructor for constant tiles */ private[rasterframes] val makeConstantTile: (Number, Int, Int, String) ⇒ Tile = (value, cols, rows, cellTypeName) ⇒ { val cellType = CellType.fromName(cellTypeName) @@ -324,33 +112,11 @@ package object functions { DoubleConstantNoDataCellType ).map(_.toString).distinct - /** - * Generate a tile with the values from the data tile, but where cells in the - * masking tile contain NODATA, replace the data value with NODATA. - */ - private[rasterframes] val mask: (Tile, Tile) ⇒ Tile = - (dataTile, maskingTile) ⇒ Mask(dataTile, Defined(maskingTile), 0, NODATA) - - /** - * Generate a tile with the values from the data tile, but where cells in the - * masking tile contain the masking value, replace the data value with NODATA. - */ - private[rasterframes] val maskByValue: (Tile, Tile, Int) ⇒ Tile = - (dataTile, maskingTile, maskingValue) ⇒ - Mask(dataTile, maskingTile, maskingValue, NODATA) - - /** - * Generate a tile with the values from the data tile, but where cells in the - * masking tile DO NOT contain NODATA, replace the data value with NODATA. - */ - private[rasterframes] val inverseMask: (Tile, Tile) ⇒ Tile = - (dataTile, maskingTile) ⇒ InverseMask(dataTile, Defined(maskingTile), 0, NODATA) - /** * Rasterize geometry into tiles. */ private[rasterframes] val rasterize: (Geometry, Geometry, Int, Int, Int) ⇒ Tile = { - import geotrellis.vector.{Geometry ⇒ GTGeometry} + import geotrellis.vector.{Geometry => GTGeometry} (geom, bounds, value, cols, rows) ⇒ { // We have to do this because (as of spark 2.2.x) Encoder-only types // can't be used as UDF inputs. Only Spark-native types and UDTs. @@ -359,91 +125,6 @@ package object functions { } } - /** Cellwise less than value comparison between two tiles. */ - private[rasterframes] val localLess: (Tile, Tile) ⇒ Tile = safeEval(Less.apply) - - /** Cellwise less than value comparison between a tile and a scalar. */ - private[rasterframes] val localLessScalarInt: (Tile, Int) ⇒ Tile = safeEval((t: Tile, scalar: Int) ⇒ { - t.localLess(scalar) - }) - - /** Cellwise less than value comparison between a tile and a scalar. */ - private[rasterframes] val localLessScalar: (Tile, Double) ⇒ Tile = safeEval((t: Tile, scalar: Double) ⇒ { - floatingPointTile(t).localLess(scalar) - }) - - /** Cellwise less than or equal to value comparison between two tiles. 
*/ - private[rasterframes] val localLessEqual: (Tile, Tile) ⇒ Tile = safeEval(LessOrEqual.apply) - - /** Cellwise less than or equal to value comparison between a tile and a scalar. */ - private[rasterframes] val localLessEqualScalarInt: (Tile, Int) ⇒ Tile = safeEval((t: Tile, scalar: Int) ⇒ { - t.localLessOrEqual(scalar) - }) - - /** Cellwise less than or equal to value comparison between a tile and a scalar. */ - private[rasterframes] val localLessEqualScalar: (Tile, Double) ⇒ Tile = safeEval((t: Tile, scalar: Double) ⇒ { - floatingPointTile(t).localLessOrEqual(scalar) - }) - - /** Cellwise greater than value comparison between two tiles. */ - private[rasterframes] val localGreater: (Tile, Tile) ⇒ Tile = safeEval(Less.apply) - - /** Cellwise greater than value comparison between a tile and a scalar. */ - private[rasterframes] val localGreaterScalarInt: (Tile, Int) ⇒ Tile = safeEval((t: Tile, scalar: Int) ⇒ { - t.localGreater(scalar) - }) - - /** Cellwise greater than value comparison between a tile and a scalar. */ - private[rasterframes] val localGreaterScalar: (Tile, Double) ⇒ Tile = safeEval((t: Tile, scalar: Double) ⇒ { - floatingPointTile(t).localGreater(scalar) - }) - - /** Cellwise greater than or equal to value comparison between two tiles. */ - private[rasterframes] val localGreaterEqual: (Tile, Tile) ⇒ Tile = safeEval(LessOrEqual.apply) - - /** Cellwise greater than or equal to value comparison between a tile and a scalar. */ - private[rasterframes] val localGreaterEqualScalarInt: (Tile, Int) ⇒ Tile = safeEval((t: Tile, scalar: Int) ⇒ { - t.localGreaterOrEqual(scalar) - }) - - /** Cellwise greater than or equal to value comparison between a tile and a scalar. */ - private[rasterframes] val localGreaterEqualScalar: (Tile, Double) ⇒ Tile = safeEval((t: Tile, scalar: Double) ⇒ { - floatingPointTile(t).localGreaterOrEqual(scalar) - }) - - /** Cellwise equal to value comparison between two tiles. */ - private[rasterframes] val localEqual: (Tile, Tile) ⇒ Tile = safeEval(Equal.apply) - - /** Cellwise equal to value comparison between a tile and a scalar. */ - private[rasterframes] val localEqualScalarInt: (Tile, Int) ⇒ Tile = safeEval((t: Tile, scalar: Int) ⇒ { - t.localEqual(scalar) - }) - - /** Cellwise equal to value comparison between a tile and a scalar. */ - private[rasterframes] val localEqualScalar: (Tile, Double) ⇒ Tile = safeEval((t: Tile, scalar: Double) ⇒ { - floatingPointTile(t).localEqual(scalar) - }) - - /** Cellwise inequality value comparison between two tiles. */ - private[rasterframes] val localUnequal: (Tile, Tile) ⇒ Tile = safeEval(Unequal.apply) - - /** Cellwise inequality value comparison between a tile and a scalar. */ - private[rasterframes] val localUnequalScalarInt: (Tile, Int) ⇒ Tile = safeEval((t: Tile, scalar: Int) ⇒ { - t.localUnequal(scalar) - }) - - /** Cellwise inequality value comparison between a tile and a scalar. */ - private[rasterframes] val localUnequalScalar: (Tile, Double) ⇒ Tile = safeEval((t: Tile, scalar: Double) ⇒ { - floatingPointTile(t).localUnequal(scalar) - }) - - /** Reporjects a geometry column from one CRS to another. */ - private[rasterframes] val reprojectGeometry: (Geometry, CRS, CRS) ⇒ Geometry = - (sourceGeom, src, dst) ⇒ { - val trans = new ReprojectionTransformer(src, dst) - trans.transform(sourceGeom) - } - /** Reporjects a geometry column from one CRS to another, where CRS are defined in Proj4 format. 
*/ private[rasterframes] val reprojectGeometryCRSName: (Geometry, String, String) ⇒ Geometry = (sourceGeom, srcName, dstName) ⇒ { @@ -454,64 +135,14 @@ package object functions { } def register(sqlContext: SQLContext): Unit = { - sqlContext.udf.register("rf_mask", mask) - sqlContext.udf.register("rf_mask_by_value", maskByValue) - sqlContext.udf.register("rf_inverse_mask", inverseMask) + sqlContext.udf.register("rf_make_constant_tile", makeConstantTile) sqlContext.udf.register("rf_tile_zeros", tileZeros) sqlContext.udf.register("rf_tile_ones", tileOnes) - sqlContext.udf.register("rf_tile_to_array_int", tileToArray[Int]) - sqlContext.udf.register("rf_tile_to_array_double", tileToArray[Double]) - sqlContext.udf.register("rf_agg_histogram", aggHistogram) - sqlContext.udf.register("rf_agg_stats", aggStats) - sqlContext.udf.register("rf_tile_min", tileMin) - sqlContext.udf.register("rf_tile_max", tileMax) - sqlContext.udf.register("rf_tile_mean", tileMean) - sqlContext.udf.register("rf_tile_sum", tileSum) - sqlContext.udf.register("rf_tile_histogram", tileHistogram) - sqlContext.udf.register("rf_tile_stats", tileStats) - sqlContext.udf.register("rf_data_cells", dataCells) - sqlContext.udf.register("rf_no_data_cells", noDataCells) - sqlContext.udf.register("rf_is_no_data_tile", isNoDataTile) - sqlContext.udf.register("rf_local_agg_stats", localAggStats) - sqlContext.udf.register("rf_local_agg_max", localAggMax) - sqlContext.udf.register("rf_local_agg_min", localAggMin) - sqlContext.udf.register("rf_local_agg_mean", localAggMean) - sqlContext.udf.register("rf_local_agg_count", localAggCount) - sqlContext.udf.register("rf_local_add", localAdd) - sqlContext.udf.register("rf_local_add_scalar", localAddScalar) - sqlContext.udf.register("rf_local_add_scalar_int", localAddScalarInt) - sqlContext.udf.register("rf_local_subtract", localSubtract) - sqlContext.udf.register("rf_local_subtract_scalar", localSubtractScalar) - sqlContext.udf.register("rf_local_subtract_scalar_int", localSubtractScalarInt) - sqlContext.udf.register("rf_local_multiply", localMultiply) - sqlContext.udf.register("rf_local_multiply_scalar", localMultiplyScalar) - sqlContext.udf.register("rf_local_multiply_scalar_int", localMultiplyScalarInt) - sqlContext.udf.register("rf_local_divide", localDivide) - sqlContext.udf.register("rf_local_divide_scalar", localDivideScalar) - sqlContext.udf.register("rf_local_divide_scalar_int", localDivideScalarInt) - sqlContext.udf.register("rf_normalized_difference", normalizedDifference) + sqlContext.udf.register("rf_cell_types", cellTypes) - sqlContext.udf.register("rf_render_ascii", renderAscii) sqlContext.udf.register("rf_rasterize", rasterize) - sqlContext.udf.register("rf_less", localLess) - sqlContext.udf.register("rf_less_scalar", localLessScalar) - sqlContext.udf.register("rf_less_scalar_int", localLessScalarInt) - sqlContext.udf.register("rf_less_equal", localLessEqual) - sqlContext.udf.register("rf_less_equal_scalar", localLessEqualScalar) - sqlContext.udf.register("rf_less_equal_scalar_int", localLessEqualScalarInt) - sqlContext.udf.register("rf_greater", localGreater) - sqlContext.udf.register("rf_greater_scalar", localGreaterScalar) - sqlContext.udf.register("rf_greaterscalar_int", localGreaterScalarInt) - sqlContext.udf.register("rf_greater_equal", localGreaterEqual) - sqlContext.udf.register("rf_greater_equal_scalar", localGreaterEqualScalar) - sqlContext.udf.register("rf_greater_equal_scalar_int", localGreaterEqualScalarInt) - sqlContext.udf.register("rf_equal", localEqual) - 
sqlContext.udf.register("rf_equal_scalar", localEqualScalar) - sqlContext.udf.register("rf_equal_scalar_int", localEqualScalarInt) - sqlContext.udf.register("rf_unequal", localUnequal) - sqlContext.udf.register("rf_unequal_scalar", localUnequalScalar) - sqlContext.udf.register("rf_unequal_scalar_int", localUnequalScalarInt) + sqlContext.udf.register("rf_reproject_geometry", reprojectGeometryCRSName) } } diff --git a/core/src/main/scala/astraea/spark/rasterframes/jts/Implicits.scala b/core/src/main/scala/astraea/spark/rasterframes/jts/Implicits.scala index 03d74eed7..e257ebfa5 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/jts/Implicits.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/jts/Implicits.scala @@ -29,14 +29,13 @@ import geotrellis.vector.{Point ⇒ gtPoint} import org.apache.spark.sql.{Column, TypedColumn} import org.apache.spark.sql.functions._ import org.locationtech.geomesa.spark.jts.DataFrameFunctions.SpatialConstructors +import astraea.spark.rasterframes.encoders.StandardEncoders.PrimitiveEncoders._ /** * Extension methods on typed columns allowing for DSL-like queries over JTS types. * @since 1/10/18 */ trait Implicits extends SpatialConstructors { - import astraea.spark.rasterframes.encoders.SparkDefaultEncoders._ - implicit class ExtentColumnMethods[T <: Geometry](val self: TypedColumn[Any, T]) extends MethodExtensions[TypedColumn[Any, T]] { diff --git a/core/src/main/scala/astraea/spark/rasterframes/model/CellContext.scala b/core/src/main/scala/astraea/spark/rasterframes/model/CellContext.scala new file mode 100644 index 000000000..cac2903dd --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/model/CellContext.scala @@ -0,0 +1,50 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.model +import astraea.spark.rasterframes.encoders.{CatalystSerializer, CatalystSerializerEncoder} +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.types.{ShortType, StructField, StructType} + +case class CellContext(tile_context: TileContext, tile_data_context: TileDataContext, col_index: Short, row_index: Short) +object CellContext { + implicit val serializer: CatalystSerializer[CellContext] = new CatalystSerializer[CellContext] { + override def schema: StructType = StructType(Seq( + StructField("tile_context", CatalystSerializer[TileContext].schema, false), + StructField("tile_data_context", CatalystSerializer[TileDataContext].schema, false), + StructField("col_index", ShortType, false), + StructField("row_index", ShortType, false) + )) + override protected def to[R](t: CellContext, io: CatalystSerializer.CatalystIO[R]): R = io.create( + io.to(t.tile_context), + io.to(t.tile_data_context), + t.col_index, + t.row_index + ) + override protected def from[R](t: R, io: CatalystSerializer.CatalystIO[R]): CellContext = CellContext( + io.get[TileContext](t, 0), + io.get[TileDataContext](t, 1), + io.getShort(t, 2), + io.getShort(t, 3) + ) + } + implicit def encoder: ExpressionEncoder[CellContext] = CatalystSerializerEncoder[CellContext]() +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/model/Cells.scala b/core/src/main/scala/astraea/spark/rasterframes/model/Cells.scala new file mode 100644 index 000000000..acf847e45 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/model/Cells.scala @@ -0,0 +1,72 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.model +import astraea.spark.rasterframes.encoders.{CatalystSerializer, CatalystSerializerEncoder} +import astraea.spark.rasterframes.ref.RasterRef +import astraea.spark.rasterframes.ref.RasterRef.RasterRefTile +import geotrellis.raster.{ArrayTile, Tile} +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.types.{BinaryType, StructField, StructType} + +/** Represents the union of binary cell data or a reference to the data. */ +case class Cells(data: Either[Array[Byte], RasterRef]) { + def isRef: Boolean = data.isRight + /** Convert cells into either a RasterRefTile or an ArrayTile. */ + def toTile(ctx: TileDataContext): Tile = { + data.fold( + bytes => ArrayTile.fromBytes(bytes, ctx.cell_type, ctx.dimensions.cols, ctx.dimensions.rows), + ref => RasterRefTile(ref) + ) + } +} + +object Cells { + /** Extracts the Cells from a Tile.
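A sketch of how the encoder defined above round-trips a `CellContext` through Catalyst (the extent, CRS, cell type, and indices are arbitrary placeholders; `toRow`/`fromRow` are the stock Spark 2.x `ExpressionEncoder` entry points):

    // Hedged round-trip sketch for the CellContext encoder; all values are arbitrary.
    import astraea.spark.rasterframes.model.{CellContext, TileContext, TileDataContext, TileDimensions}
    import geotrellis.proj4.LatLng
    import geotrellis.raster.IntConstantNoDataCellType
    import geotrellis.vector.Extent

    val cc = CellContext(
      TileContext(Extent(0.0, 0.0, 10.0, 10.0), LatLng),
      TileDataContext(IntConstantNoDataCellType, TileDimensions(8, 8)),
      col_index = 3,
      row_index = 4
    )
    val enc = CellContext.encoder
    val row = enc.toRow(cc)                      // CellContext => InternalRow
    val back = enc.resolveAndBind().fromRow(row) // InternalRow => CellContext
    assert(back == cc)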
*/ + def apply(t: Tile): Cells = { + t match { + case ref: RasterRefTile => + Cells(Right(ref.rr)) + case o => + Cells(Left(o.toBytes)) + } + } + + implicit def cellsSerializer: CatalystSerializer[Cells] = new CatalystSerializer[Cells] { + override def schema: StructType = StructType(Seq( + StructField("cells", BinaryType, true), + StructField("ref", CatalystSerializer[RasterRef].schema, true) + )) + override protected def to[R](t: Cells, io: CatalystSerializer.CatalystIO[R]): R = io.create( + t.data.left.getOrElse(null), + t.data.right.map(rr => io.to(rr)).right.getOrElse(null) + ) + override protected def from[R](t: R, io: CatalystSerializer.CatalystIO[R]): Cells = { + if (!io.isNullAt(t, 0)) + Cells(Left(io.getByteArray(t, 0))) + else if (!io.isNullAt(t, 1)) + Cells(Right(io.get[RasterRef](t, 1))) + else throw new IllegalArgumentException("must be either cell data or a ref, but not null") + } + } + + implicit def encoder: ExpressionEncoder[Cells] = CatalystSerializerEncoder[Cells]() +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/model/TileContext.scala b/core/src/main/scala/astraea/spark/rasterframes/model/TileContext.scala new file mode 100644 index 000000000..f5d49524c --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/model/TileContext.scala @@ -0,0 +1,55 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License.
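Given the `Cells` extractor just defined, an in-memory tile should survive a bytes round trip when paired with its `TileDataContext`; a small sketch (the 8x8 integer tile is arbitrary):

    // Hedged sketch: in-memory tiles become Left(bytes); toTile rebuilds an ArrayTile.
    import astraea.spark.rasterframes.model.{Cells, TileDataContext}
    import geotrellis.raster.{ArrayTile, Tile}

    val tile: Tile = ArrayTile(Array.tabulate(64)(identity), 8, 8)
    val cells = Cells(tile)
    assert(!cells.isRef) // not a RasterRefTile, so the Left(bytes) branch
    val restored = cells.toTile(TileDataContext(tile))
    assert(restored.toArray().sameElements(tile.toArray()))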
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.model +import astraea.spark.rasterframes.encoders.{CatalystSerializer, CatalystSerializerEncoder} +import astraea.spark.rasterframes.tiles.ProjectedRasterTile +import geotrellis.proj4.CRS +import geotrellis.raster.Tile +import geotrellis.vector.Extent +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.types.{StructField, StructType} + +case class TileContext(extent: Extent, crs: CRS) { + def toProjectRasterTile(t: Tile): ProjectedRasterTile = ProjectedRasterTile(t, extent, crs) +} +object TileContext { + def apply(prt: ProjectedRasterTile): TileContext = new TileContext(prt.extent, prt.crs) + def unapply(tile: Tile): Option[(Extent, CRS)] = tile match { + case prt: ProjectedRasterTile => Some((prt.extent, prt.crs)) + case _ => None + } + implicit val serializer: CatalystSerializer[TileContext] = new CatalystSerializer[TileContext] { + override def schema: StructType = StructType(Seq( + StructField("extent", CatalystSerializer[Extent].schema, false), + StructField("crs", CatalystSerializer[CRS].schema, false) + )) + override protected def to[R](t: TileContext, io: CatalystSerializer.CatalystIO[R]): R = io.create( + io.to(t.extent), + io.to(t.crs) + ) + override protected def from[R](t: R, io: CatalystSerializer.CatalystIO[R]): TileContext = TileContext( + io.get[Extent](t, 0), + io.get[CRS](t, 1) + ) + } + implicit def encoder: ExpressionEncoder[TileContext] = CatalystSerializerEncoder[TileContext]() +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/model/TileDataContext.scala b/core/src/main/scala/astraea/spark/rasterframes/model/TileDataContext.scala new file mode 100644 index 000000000..121f8b845 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/model/TileDataContext.scala @@ -0,0 +1,59 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.model +import astraea.spark.rasterframes.encoders.{CatalystSerializer, CatalystSerializerEncoder} +import astraea.spark.rasterframes.encoders.CatalystSerializer._ +import geotrellis.raster.{CellType, Tile} +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.types.{StructField, StructType} + +/** Encapsulates all information about a tile aside from actual cell values. */ +case class TileDataContext(cell_type: CellType, dimensions: TileDimensions) +object TileDataContext { + + /** Extracts the TileDataContext from a Tile. 
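Because `TileContext` also provides an `unapply`, georeferencing can be recovered by pattern matching on any `Tile`; a sketch (tile contents and the extent/CRS are arbitrary):

    // Hedged sketch of the extractor above: only ProjectedRasterTile matches.
    import astraea.spark.rasterframes.model.TileContext
    import astraea.spark.rasterframes.tiles.ProjectedRasterTile
    import geotrellis.proj4.LatLng
    import geotrellis.raster.{ArrayTile, Tile}
    import geotrellis.vector.Extent

    def describe(t: Tile): String = t match {
      case TileContext(extent, crs) => s"georeferenced: $extent / $crs"
      case _                        => "bare tile, no spatial context"
    }

    val bare: Tile = ArrayTile(Array.fill(4)(1), 2, 2)
    describe(bare)                                                  // bare tile
    describe(ProjectedRasterTile(bare, Extent(0, 0, 1, 1), LatLng)) // georeferenced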
*/ + def apply(t: Tile): TileDataContext = { + require(t.cols <= Short.MaxValue, s"RasterFrames doesn't support tiles of size ${t.cols}") + require(t.rows <= Short.MaxValue, s"RasterFrames doesn't support tiles of size ${t.rows}") + TileDataContext( + t.cellType, TileDimensions(t.dimensions) + ) + } + + implicit val serializer: CatalystSerializer[TileDataContext] = new CatalystSerializer[TileDataContext] { + override def schema: StructType = StructType(Seq( + StructField("cell_type", CatalystSerializer[CellType].schema, false), + StructField("dimensions", CatalystSerializer[TileDimensions].schema, false) + )) + + override protected def to[R](t: TileDataContext, io: CatalystIO[R]): R = io.create( + io.to(t.cell_type), + io.to(t.dimensions) + ) + override protected def from[R](t: R, io: CatalystIO[R]): TileDataContext = TileDataContext( + io.get[CellType](t, 0), + io.get[TileDimensions](t, 1) + ) + } + + implicit def encoder: ExpressionEncoder[TileDataContext] = CatalystSerializerEncoder[TileDataContext]() +} diff --git a/core/src/main/scala/astraea/spark/rasterframes/TileDimensions.scala b/core/src/main/scala/astraea/spark/rasterframes/model/TileDimensions.scala similarity index 78% rename from core/src/main/scala/astraea/spark/rasterframes/TileDimensions.scala rename to core/src/main/scala/astraea/spark/rasterframes/model/TileDimensions.scala index d850cabb4..2f7f579ba 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/TileDimensions.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/model/TileDimensions.scala @@ -1,7 +1,7 @@ /* * This software is licensed under the Apache 2 license, quoted below. * - * Copyright 2018 Astraea, Inc. + * Copyright 2019 Astraea, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
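The two `require` guards above keep tile dimensions within `Short` range, consistent with the `ShortType`-backed `TileDimensions` schema that follows; a quick sketch of the boundary (the oversized tile is contrived):

    // Hedged sketch: TileDataContext accepts ordinary tiles and rejects
    // dimensions that cannot round-trip through the ShortType schema.
    import astraea.spark.rasterframes.model.TileDataContext
    import geotrellis.raster.ArrayTile
    import scala.util.Try

    val ok = TileDataContext(ArrayTile(Array.fill(256 * 256)(0), 256, 256))
    val tooWide = Try(TileDataContext(ArrayTile(Array.fill(40000)(0), 40000, 1)))
    assert(tooWide.isFailure) // 40000 > Short.MaxValue, so require(...) throws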
You may obtain a copy of @@ -19,11 +19,12 @@ * */ -package astraea.spark.rasterframes +package astraea.spark.rasterframes.model -import astraea.spark.rasterframes.encoders.CatalystSerializer import astraea.spark.rasterframes.encoders.CatalystSerializer.CatalystIO +import astraea.spark.rasterframes.encoders.{CatalystSerializer, CatalystSerializerEncoder} import geotrellis.raster.Grid +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.types.{ShortType, StructField, StructType} /** @@ -34,6 +35,8 @@ import org.apache.spark.sql.types.{ShortType, StructField, StructType} case class TileDimensions(cols: Int, rows: Int) extends Grid object TileDimensions { + def apply(colsRows: (Int, Int)): TileDimensions = new TileDimensions(colsRows._1, colsRows._2) + implicit val serializer: CatalystSerializer[TileDimensions] = new CatalystSerializer[TileDimensions] { override def schema: StructType = StructType(Seq( StructField("cols", ShortType, false), @@ -50,4 +53,6 @@ object TileDimensions { io.getShort(t, 1) ) } + + implicit def encoder: ExpressionEncoder[TileDimensions] = ExpressionEncoder[TileDimensions]() } diff --git a/core/src/main/scala/astraea/spark/rasterframes/package.scala b/core/src/main/scala/astraea/spark/rasterframes/package.scala index 54611eede..7b360ed25 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/package.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/package.scala @@ -18,12 +18,11 @@ package astraea.spark import astraea.spark.rasterframes.encoders.StandardEncoders import astraea.spark.rasterframes.util.ZeroSevenCompatibilityKit -import com.typesafe.config.{Config, ConfigFactory} +import com.typesafe.config.ConfigFactory import com.typesafe.scalalogging.LazyLogging import geotrellis.raster.{Tile, TileFeature} import geotrellis.spark.{ContextRDD, Metadata, SpaceTimeKey, SpatialKey, TileLayerMetadata} import org.apache.spark.rdd.RDD -import org.apache.spark.serializer.KryoSerializer import org.apache.spark.sql._ import org.locationtech.geomesa.spark.jts.DataFrameFunctions import shapeless.tag.@@ @@ -61,12 +60,12 @@ package object rasterframes extends StandardColumns val config = sqlContext.sparkSession.conf if(config.getOption("spark.serializer").isEmpty) { logger.warn("No serializer has been registered with Spark. Default Java serialization will be used, which is slow. 
" + - "Consider the following settings:" + + "Consider using the following settings:" + """ | SparkSession | .builder() - | .master("local[*]") - | .appName(getClass.getName) + | .master("...") + | .appName("...") | .withKryoSerialization // <--- RasterFrames extension method """.stripMargin diff --git a/core/src/main/scala/astraea/spark/rasterframes/ref/RasterRef.scala b/core/src/main/scala/astraea/spark/rasterframes/ref/RasterRef.scala index 52a8e0b2f..ff176765c 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/ref/RasterRef.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/ref/RasterRef.scala @@ -21,12 +21,17 @@ package astraea.spark.rasterframes.ref +import astraea.spark.rasterframes.encoders.{CatalystSerializer, CatalystSerializerEncoder} +import astraea.spark.rasterframes.encoders.CatalystSerializer.CatalystIO import astraea.spark.rasterframes.tiles.ProjectedRasterTile import com.typesafe.scalalogging.LazyLogging import geotrellis.proj4.CRS import geotrellis.raster.{CellType, GridBounds, Tile, TileLayout} import geotrellis.spark.tiling.LayoutDefinition import geotrellis.vector.{Extent, ProjectedExtent} +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.rf.RasterSourceUDT +import org.apache.spark.sql.types.{StructField, StructType} /** * A delayed-read projected raster implementation. @@ -64,6 +69,8 @@ case class RasterRef(source: RasterSource, subextent: Option[Extent]) object RasterRef extends LazyLogging { private val log = logger + + /** Constructor for when data extent cover whole raster. */ def apply(source: RasterSource): RasterRef = RasterRef(source, None) @@ -86,4 +93,25 @@ object RasterRef extends LazyLogging { override def convert(ct: CellType): ProjectedRasterTile = ProjectedRasterTile(rr.realizedTile.convert(ct), extent, crs) } + + implicit val rasterRefSerializer: CatalystSerializer[RasterRef] = new CatalystSerializer[RasterRef] { + val rsType = new RasterSourceUDT() + override def schema: StructType = StructType(Seq( + StructField("source", rsType, false), + StructField("subextent", CatalystSerializer[Extent].schema, true) + )) + + override def to[R](t: RasterRef, io: CatalystIO[R]): R = io.create( + io.to(t.source)(RasterSourceUDT.rasterSourceSerializer), + t.subextent.map(io.to[Extent]).orNull + ) + + override def from[R](row: R, io: CatalystIO[R]): RasterRef = RasterRef( + io.get[RasterSource](row, 0)(RasterSourceUDT.rasterSourceSerializer), + if (io.isNullAt(row, 1)) None + else Option(io.get[Extent](row, 1)) + ) + } + + implicit def rrEncoder: ExpressionEncoder[RasterRef] = CatalystSerializerEncoder[RasterRef](true) } diff --git a/core/src/main/scala/astraea/spark/rasterframes/ref/RasterSource.scala b/core/src/main/scala/astraea/spark/rasterframes/ref/RasterSource.scala index 060c9f2e0..9dc9bd55e 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/ref/RasterSource.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/ref/RasterSource.scala @@ -24,15 +24,16 @@ import java.net.URI import java.time.ZonedDateTime import java.time.format.DateTimeFormatter +import astraea.spark.rasterframes.NOMINAL_TILE_SIZE +import astraea.spark.rasterframes.model.TileContext import astraea.spark.rasterframes.ref.RasterRef.RasterRefTile import astraea.spark.rasterframes.tiles.ProjectedRasterTile import astraea.spark.rasterframes.util.GeoTiffInfoSupport -import astraea.spark.rasterframes.NOMINAL_TILE_SIZE import com.typesafe.scalalogging.LazyLogging import geotrellis.proj4.CRS +import geotrellis.raster._ import 
geotrellis.raster.io.geotiff.reader.GeoTiffReader import geotrellis.raster.io.geotiff.{GeoTiffSegmentLayout, MultibandGeoTiff, SinglebandGeoTiff, Tags} -import geotrellis.raster._ import geotrellis.raster.split.Split import geotrellis.spark.io.hadoop.HdfsRangeReader import geotrellis.spark.io.s3.S3Client @@ -42,6 +43,9 @@ import geotrellis.util.{FileRangeReader, RangeReader} import geotrellis.vector.Extent import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path +import org.apache.spark.annotation.Experimental +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.rf.RasterSourceUDT import scala.util.Try @@ -50,6 +54,7 @@ import scala.util.Try * * @since 8/21/18 */ +@Experimental sealed trait RasterSource extends ProjectedRasterLike with Serializable { def crs: CRS @@ -61,9 +66,21 @@ sealed trait RasterSource extends ProjectedRasterLike with Serializable { def bandCount: Int + def tags: Option[Tags] + def read(extent: Extent): Either[Raster[Tile], Raster[MultibandTile]] + /** Reads the given extent as a single multiband raster. */ + def readMultiband(extent: Extent): Raster[MultibandTile] = + read(extent).fold(r => { + r.copy(tile = MultibandTile(r.tile)) + }, identity) + def readAll(): Either[Seq[Raster[Tile]], Seq[Raster[MultibandTile]]] + def readAllMultiband(): Seq[Raster[MultibandTile]] = + readAll().fold(_.map(r => { + r.copy(tile = MultibandTile(r.tile)) + }), identity) def readAllLazy(): Either[Seq[Raster[Tile]], Seq[Raster[MultibandTile]]] = { val extents = nativeTiling @@ -94,6 +111,8 @@ sealed trait RasterSource extends ProjectedRasterLike with Serializable { def gridExtent = GridExtent(extent, cellSize) + def tileContext: TileContext = TileContext(extent, crs) + def nativeTiling: Seq[Extent] = { nativeLayout.map { tileLayout ⇒ val layout = LayoutDefinition(extent, tileLayout) @@ -108,7 +127,10 @@ sealed trait RasterSource extends ProjectedRasterLike with Serializable { } object RasterSource extends LazyLogging { - + implicit def rsEncoder: ExpressionEncoder[RasterSource] = { + RasterSourceUDT // Makes sure UDT is registered first + ExpressionEncoder() + } private def _logger = logger @@ -168,6 +190,8 @@ object RasterSource extends LazyLogging { override def bandCount: Int = 1 + override def tags: Option[Tags] = None + override def read(extent: Extent): Either[Raster[Tile], Raster[MultibandTile]] = Left( Raster(tile.crop(rasterExtent.gridBoundsFor(extent, false)), extent) ) @@ -207,6 +231,8 @@ object RasterSource extends LazyLogging { def bandCount: Int = tiffInfo.bandCount + override def tags: Option[Tags] = Option(tiffInfo.tags) + def nativeLayout: Option[TileLayout] = { if (tiffInfo.segmentLayout.isTiled) Some(tiffInfo.segmentLayout.tileLayout) diff --git a/core/src/main/scala/astraea/spark/rasterframes/stats/CellHistogram.scala b/core/src/main/scala/astraea/spark/rasterframes/stats/CellHistogram.scala index 82644d974..efc4908db 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/stats/CellHistogram.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/stats/CellHistogram.scala @@ -19,7 +19,11 @@ */ package astraea.spark.rasterframes.stats -import geotrellis.raster.histogram.{StreamingHistogram, Histogram ⇒ GTHistogram} +import astraea.spark.rasterframes.encoders.StandardEncoders +import geotrellis.raster.Tile +import geotrellis.raster.histogram.{Histogram => GTHistogram} +import org.apache.spark.sql.types._ + import scala.collection.mutable.{ListBuffer => MutableListBuffer} /** @@ -27,12 +31,9 @@ import 
scala.collection.mutable.{ListBuffer => MutableListBuffer} * * @since 4/3/18 */ -case class CellHistogram(stats: CellStatistics, bins: Seq[CellHistogram.Bin]) { - - def labels = bins.map(_.value) - def mean = stats.mean - def totalCount = stats.dataCells - def asciiStats = stats.asciiStats +case class CellHistogram(bins: Seq[CellHistogram.Bin]) { + lazy val labels: Seq[Double] = bins.map(_.value) + lazy val totalCount = bins.foldLeft(0L)(_ + _.count) def asciiHistogram(width: Int = 80)= { val counts = bins.map(_.count) val maxCount = counts.max.toFloat @@ -68,7 +69,7 @@ case class CellHistogram(stats: CellStatistics, bins: Seq[CellHistogram.Bin]) { } } - private def cdfIntervals(): Iterator[((Double, Double), (Double, Double))] = { + private def cdfIntervals: Iterator[((Double, Double), (Double, Double))] = { if(bins.size < 2) { Iterator.empty } else { @@ -151,15 +152,25 @@ case class CellHistogram(stats: CellStatistics, bins: Seq[CellHistogram.Bin]) { object CellHistogram { case class Bin(value: Double, count: Long) + + def apply(tile: Tile): CellHistogram = { + val bins = if (tile.cellType.isFloatingPoint) { + val h = tile.histogramDouble + h.binCounts().map(p ⇒ Bin(p._1, p._2)) + } + else { + val h = tile.histogram + h.binCounts().map(p ⇒ Bin(p._1, p._2)) + } + CellHistogram(bins) + } + def apply(hist: GTHistogram[Int]): CellHistogram = { - val stats = CellStatistics(hist.statistics().get) - CellHistogram(stats, hist.binCounts().map(p ⇒ Bin(p._1.toDouble, p._2))) + CellHistogram(hist.binCounts().map(p ⇒ Bin(p._1, p._2))) } def apply(hist: GTHistogram[Double])(implicit ev: DummyImplicit): CellHistogram = { - val stats = hist.statistics().map(CellStatistics.apply).getOrElse(CellStatistics.empty) - // Code should be this, but can't due to geotrellis#2664: - // val bins = hist.binCounts().map(p ⇒ Bin(p._1, p._2)) - val bins = hist.asInstanceOf[StreamingHistogram].buckets().map(b ⇒ Bin(b.label, b.count)) - CellHistogram(stats, bins) + CellHistogram(hist.binCounts().map(p ⇒ Bin(p._1, p._2))) } + + lazy val schema: StructType = StandardEncoders.cellHistEncoder.schema } diff --git a/core/src/main/scala/astraea/spark/rasterframes/stats/CellStatistics.scala b/core/src/main/scala/astraea/spark/rasterframes/stats/CellStatistics.scala index ae9729a22..e1ba03b60 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/stats/CellStatistics.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/stats/CellStatistics.scala @@ -19,17 +19,20 @@ */ package astraea.spark.rasterframes.stats +import astraea.spark.rasterframes.encoders.StandardEncoders +import geotrellis.raster.Tile +import org.apache.spark.sql.types.StructType /** * Container for computed statistics over cells. * * @since 4/3/18 */ -case class CellStatistics(dataCells: Long, noDataCells: Long, min: Double, max: Double, mean: Double, variance: Double) { +case class CellStatistics(data_cells: Long, no_data_cells: Long, min: Double, max: Double, mean: Double, variance: Double) { def stddev: Double = math.sqrt(variance) def asciiStats = Seq( - "data_cells: " + dataCells, - "no_data_cells: " + noDataCells, + "data_cells: " + data_cells, + "no_data_cells: " + no_data_cells, "min: " + min, "max: " + max, "mean: " + mean, @@ -47,11 +50,21 @@ case class CellStatistics(dataCells: Long, noDataCells: Long, min: Double, max: } object CellStatistics { // Convert GeoTrellis stats object into our simplified one. 
+ private[stats] def apply(stats: geotrellis.raster.summary.Statistics[Double]) = new CellStatistics(stats.dataCells, -1, stats.zmin, stats.zmax, stats.mean, stats.stddev * stats.stddev) + private[stats] def apply(stats: geotrellis.raster.summary.Statistics[Int])(implicit d: DummyImplicit) = new CellStatistics(stats.dataCells, -1, stats.zmin.toDouble, stats.zmax.toDouble, stats.mean, stats.stddev * stats.stddev) + def apply(tile: Tile): Option[CellStatistics] = { + val base = if (tile.cellType.isFloatingPoint) tile.statisticsDouble.map(CellStatistics.apply) + else tile.statistics.map(CellStatistics.apply) + base.map(s => s.copy(no_data_cells = tile.size - s.data_cells)) + } + def empty = new CellStatistics(0, 0, Double.NaN, Double.NaN, Double.NaN, Double.NaN) + + lazy val schema: StructType = StandardEncoders.cellStatsEncoder.schema } \ No newline at end of file diff --git a/core/src/main/scala/astraea/spark/rasterframes/stats/LocalCellStatistics.scala b/core/src/main/scala/astraea/spark/rasterframes/stats/LocalCellStatistics.scala new file mode 100644 index 000000000..685722f62 --- /dev/null +++ b/core/src/main/scala/astraea/spark/rasterframes/stats/LocalCellStatistics.scala @@ -0,0 +1,25 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes.stats +import geotrellis.raster.Tile + +case class LocalCellStatistics(count: Tile, min: Tile, max: Tile, mean: Tile, variance: Tile) diff --git a/core/src/main/scala/astraea/spark/rasterframes/tiles/InternalRowTile.scala b/core/src/main/scala/astraea/spark/rasterframes/tiles/InternalRowTile.scala index e871da0c7..021f0946a 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/tiles/InternalRowTile.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/tiles/InternalRowTile.scala @@ -23,6 +23,8 @@ package astraea.spark.rasterframes.tiles import java.nio.ByteBuffer +import astraea.spark.rasterframes.encoders.CatalystSerializer.CatalystIO +import astraea.spark.rasterframes.model.{Cells, TileDataContext} import geotrellis.raster._ import org.apache.spark.sql.catalyst.InternalRow @@ -37,7 +39,6 @@ import org.apache.spark.sql.catalyst.InternalRow * @since 11/29/17 */ class InternalRowTile(val mem: InternalRow) extends DelegatingTile { - import org.apache.spark.sql.rf.TileUDT.C import InternalRowTile._ /** @group COPIES */ @@ -58,18 +59,30 @@ class InternalRowTile(val mem: InternalRow) extends DelegatingTile { /** @group COPIES */ protected override def delegate: Tile = realizedTile + private lazy val cellContext: TileDataContext = + CatalystIO[InternalRow].get[TileDataContext](mem, 0) + + /** Retrieve the cell type from the internal encoding. 
*/ - override lazy val cellType: CellType = - CellType.fromName(mem.getString(C.CELL_TYPE)) + override def cellType: CellType = cellContext.cell_type /** Retrieve the number of columns from the internal encoding. */ - override val cols: Int = mem.getShort(C.COLS) + override def cols: Int = cellContext.dimensions.cols /** Retrieve the number of rows from the internal encoding. */ - override val rows: Int = mem.getShort(C.ROWS) + override def rows: Int = cellContext.dimensions.rows /** Get the internally encoded tile data cells. */ - override lazy val toBytes: Array[Byte] = mem.getBinary(C.CELLS) + override lazy val toBytes: Array[Byte] = { + val cellData = CatalystIO[InternalRow] + .get[Cells](mem, 1) + .data + + cellData.left + .getOrElse(throw new IllegalStateException( + "Expected tile cell bytes, but received RasterRef instead: " + cellData.right.get) + ) + } private lazy val toByteBuffer: ByteBuffer = { val data = toBytes diff --git a/core/src/main/scala/astraea/spark/rasterframes/tiles/ProjectedRasterTile.scala b/core/src/main/scala/astraea/spark/rasterframes/tiles/ProjectedRasterTile.scala index e8a18b432..a9551dd13 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/tiles/ProjectedRasterTile.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/tiles/ProjectedRasterTile.scala @@ -21,10 +21,19 @@ package astraea.spark.rasterframes.tiles +import astraea.spark.rasterframes.encoders.{CatalystSerializer, CatalystSerializerEncoder} +import astraea.spark.rasterframes.encoders.CatalystSerializer.CatalystIO +import astraea.spark.rasterframes.model.TileContext import astraea.spark.rasterframes.ref.ProjectedRasterLike +import astraea.spark.rasterframes.ref.RasterRef.RasterRefTile import geotrellis.proj4.CRS -import geotrellis.raster.{ProjectedRaster, Tile} +import geotrellis.raster.io.geotiff.SinglebandGeoTiff +import geotrellis.raster.{CellType, ProjectedRaster, Tile} import geotrellis.vector.{Extent, ProjectedExtent} +import org.apache.spark.sql.Encoder +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.rf.TileUDT +import org.apache.spark.sql.types.{StructField, StructType} /** * A Tile that's also like a ProjectedRaster, with delayed evaluation support. 
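The hunks above and below share one pattern: a type gains a CatalystSerializer (a Catalyst StructType schema plus to/from conversions over the CatalystIO abstraction), and an ExpressionEncoder is then derived from it via CatalystSerializerEncoder. A minimal sketch of that contract for a hypothetical two-field value type follows; the getDouble accessor is an assumption, by analogy with the getShort calls in the TileDimensions serializer, and the public to/from visibility follows the RasterRef serializer above.

import astraea.spark.rasterframes.encoders.CatalystSerializer
import astraea.spark.rasterframes.encoders.CatalystSerializer.CatalystIO
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

// Hypothetical value type, for illustration only.
case class CellRange(min: Double, max: Double)

object CellRange {
  implicit val serializer: CatalystSerializer[CellRange] = new CatalystSerializer[CellRange] {
    // Field order here fixes the ordinals used by to/from below.
    override def schema: StructType = StructType(Seq(
      StructField("min", DoubleType, false),
      StructField("max", DoubleType, false)
    ))
    // Pack the fields into the row representation R.
    override def to[R](t: CellRange, io: CatalystIO[R]): R = io.create(t.min, t.max)
    // Read the fields back by position, mirroring the schema.
    override def from[R](row: R, io: CatalystIO[R]): CellRange = CellRange(
      io.getDouble(row, 0), // assumed accessor, analogous to io.getShort
      io.getDouble(row, 1)
    )
  }
}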
@@ -39,10 +48,46 @@ trait ProjectedRasterTile extends DelegatingTile with ProjectedRasterLike { } object ProjectedRasterTile { - def apply(t: Tile, extent: Extent, crs: CRS): ProjectedRasterTile = ConcreteProjectedRasterTile(t, extent, crs) - def apply(pr: ProjectedRaster[Tile]): ProjectedRasterTile = ConcreteProjectedRasterTile(pr.tile, pr.extent, pr.crs) + def apply(t: Tile, extent: Extent, crs: CRS): ProjectedRasterTile = + ConcreteProjectedRasterTile(t, extent, crs) + def apply(pr: ProjectedRaster[Tile]): ProjectedRasterTile = + ConcreteProjectedRasterTile(pr.tile, pr.extent, pr.crs) + def apply(tiff: SinglebandGeoTiff): ProjectedRasterTile = + ConcreteProjectedRasterTile(tiff.tile, tiff.extent, tiff.crs) - case class ConcreteProjectedRasterTile(t: Tile, extent: Extent, crs: CRS) extends ProjectedRasterTile { + case class ConcreteProjectedRasterTile(t: Tile, extent: Extent, crs: CRS) + extends ProjectedRasterTile { def delegate: Tile = t + override def convert(cellType: CellType): Tile = + ConcreteProjectedRasterTile(t.convert(cellType), extent, crs) } + + implicit val serializer: CatalystSerializer[ProjectedRasterTile] = new CatalystSerializer[ProjectedRasterTile] { + val TileType = new TileUDT() + override def schema: StructType = StructType(Seq( + StructField("tile_context", CatalystSerializer[TileContext].schema, false), + StructField("tile", TileType, false)) + ) + + override protected def to[R](t: ProjectedRasterTile, io: CatalystIO[R]): R = io.create( + io.to(TileContext(t.extent, t.crs)), + io.to[Tile](t)(TileUDT.tileSerializer) + ) + + override protected def from[R](t: R, io: CatalystIO[R]): ProjectedRasterTile = { + val tile = io.get[Tile](t, 1)(TileUDT.tileSerializer) + tile match { + case r: RasterRefTile => r + case _ => + val ctx = io.get[TileContext](t, 0) + val resolved = tile match { + case i: InternalRowTile => i.toArrayTile() + case o => o + } + ProjectedRasterTile(resolved, ctx.extent, ctx.crs) + } + } + } + + implicit val prtEncoder: ExpressionEncoder[ProjectedRasterTile] = CatalystSerializerEncoder[ProjectedRasterTile](true) } diff --git a/core/src/main/scala/astraea/spark/rasterframes/functions/DataBiasedOp.scala b/core/src/main/scala/astraea/spark/rasterframes/util/DataBiasedOp.scala similarity index 94% rename from core/src/main/scala/astraea/spark/rasterframes/functions/DataBiasedOp.scala rename to core/src/main/scala/astraea/spark/rasterframes/util/DataBiasedOp.scala index ee384041b..c2e2578a3 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/functions/DataBiasedOp.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/util/DataBiasedOp.scala @@ -1,7 +1,7 @@ /* * This software is licensed under the Apache 2 license, quoted below. * - * Copyright 2017 Astraea, Inc. + * Copyright 2019 Astraea, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -15,9 +15,11 @@ * License for the specific language governing permissions and limitations under * the License. 
* + * SPDX-License-Identifier: Apache-2.0 + * */ -package astraea.spark.rasterframes.functions +package astraea.spark.rasterframes.util import geotrellis.raster import geotrellis.raster.isNoData diff --git a/core/src/main/scala/astraea/spark/rasterframes/util/ZeroSevenCompatibilityKit.scala b/core/src/main/scala/astraea/spark/rasterframes/util/ZeroSevenCompatibilityKit.scala index 0f324bdd8..bbb23a282 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/util/ZeroSevenCompatibilityKit.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/util/ZeroSevenCompatibilityKit.scala @@ -20,10 +20,15 @@ */ package astraea.spark.rasterframes.util -import astraea.spark.rasterframes.encoders.SparkDefaultEncoders -import astraea.spark.rasterframes.functions.{CellCountAggregate, CellMeanAggregate} +import astraea.spark.rasterframes.expressions.TileAssembler +import astraea.spark.rasterframes.expressions.accessors._ +import astraea.spark.rasterframes.expressions.aggstats._ +import astraea.spark.rasterframes.expressions.generators._ +import astraea.spark.rasterframes.expressions.localops._ +import astraea.spark.rasterframes.expressions.tilestats._ +import astraea.spark.rasterframes.expressions.transformers._ import astraea.spark.rasterframes.stats.{CellHistogram, CellStatistics} -import astraea.spark.rasterframes.{HasCellType, util} +import astraea.spark.rasterframes.{functions => F} import com.vividsolutions.jts.geom.Geometry import geotrellis.proj4.CRS import geotrellis.raster.mapalgebra.local.LocalTileBinaryOp @@ -31,12 +36,8 @@ import geotrellis.raster.{CellType, Tile} import org.apache.spark.annotation.Experimental import org.apache.spark.sql.catalyst.analysis.FunctionRegistry import org.apache.spark.sql.functions.{lit, udf} -import org.apache.spark.sql.rf.VersionShims +import org.apache.spark.sql.rf.VersionShims._ import org.apache.spark.sql.{Column, SQLContext, TypedColumn, rf} -import astraea.spark.rasterframes.{expressions ⇒ E, functions ⇒ F} - -import scala.reflect.runtime.universe._ - /** * UDFs for working with Tiles in Spark DataFrames. @@ -44,514 +45,326 @@ import scala.reflect.runtime.universe._ * @since 4/3/17 */ object ZeroSevenCompatibilityKit { - import SparkDefaultEncoders._ - //import util.NamedColumn + import astraea.spark.rasterframes.encoders.StandardEncoders._ trait RasterFunctions { + private val delegate = new astraea.spark.rasterframes.RasterFunctions {} // format: off - /** Create a row for each cell in Tile. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def explodeTiles(cols: Column*): Column = explodeTilesSample(1.0, None, cols: _*) - - /** Create a row for each cell in Tile with random sampling and optional seed. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def explodeTilesSample(sampleFraction: Double, seed: Option[Long], cols: Column*): Column = - E.ExplodeTiles(sampleFraction, seed, cols) - - /** Create a row for each cell in Tile with random sampling (no seed). */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def explodeTilesSample(sampleFraction: Double, cols: Column*): Column = - E.ExplodeTiles(sampleFraction, None, cols) - - /** Query the number of (cols, rows) in a Tile. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. 
Please use \"snake_case\" variant instead.", "0.8.0") - def tileDimensions(col: Column): Column = E.GetDimensions(col) - - /** Flattens Tile into an array. A numeric type parameter is required. */ - @Experimental - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def tileToArray[T: HasCellType: TypeTag](col: Column): TypedColumn[Any, Array[T]] = withAlias("tileToArray", col)( - udf[Array[T], Tile](F.tileToArray).apply(col) - ).as[Array[T]] - - @Experimental - /** Convert array in `arrayCol` into a Tile of dimensions `cols` and `rows`*/ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def arrayToTile(arrayCol: Column, cols: Int, rows: Int) = withAlias("array_to_tile", arrayCol)( - udf[Tile, AnyRef](F.arrayToTile(cols, rows)).apply(arrayCol) - ) - - /** Create a Tile from a column of cell data with location indexes and preform cell conversion. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def assembleTile(columnIndex: Column, rowIndex: Column, cellData: Column, tileCols: Int, tileRows: Int, ct: CellType): TypedColumn[Any, Tile] = - convertCellType(F.TileAssembler(columnIndex, rowIndex, cellData, lit(tileCols), lit(tileRows)), ct).as(cellData.columnName).as[Tile] - - /** Create a Tile from a column of cell data with location indexes. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def assembleTile(columnIndex: Column, rowIndex: Column, cellData: Column, tileCols: Column, tileRows: Column): TypedColumn[Any, Tile] = - F.TileAssembler(columnIndex, rowIndex, cellData, tileCols, tileRows) - - /** Extract the Tile's cell type */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def cellType(col: Column): TypedColumn[Any, CellType] = E.GetCellType(col) - - /** Change the Tile's cell type */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def convertCellType(col: Column, cellType: CellType): TypedColumn[Any, Tile] = - E.SetCellType(col, cellType) - - /** Change the Tile's cell type */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def convertCellType(col: Column, cellTypeName: String): TypedColumn[Any, Tile] = - E.SetCellType(col, cellTypeName) - - /** Convert a bounding box structure to a Geometry type. Intented to support multiple schemas. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def boundsGeometry(bounds: Column): TypedColumn[Any, Geometry] = E.BoundsToGeometry(bounds) - - /** Assign a `NoData` value to the Tiles. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def withNoData(col: Column, nodata: Double) = withAlias("withNoData", col)( - udf[Tile, Tile](F.withNoData(nodata)).apply(col) - ).as[Tile] - - /** Compute the full column aggregate floating point histogram. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. 
Please use \"snake_case\" variant instead.", "0.8.0") - def aggHistogram(col: Column): TypedColumn[Any, CellHistogram] = - withAlias("histogram", col)( - F.aggHistogram(col) - ).as[CellHistogram] - - /** Compute the full column aggregate floating point statistics. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def aggStats(col: Column): TypedColumn[Any, CellStatistics] = withAlias("aggStats", col)( - F.aggStats(col) - ).as[CellStatistics] - - /** Computes the column aggregate mean. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def aggMean(col: Column) = CellMeanAggregate(col) - - /** Computes the number of non-NoData cells in a column. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def aggDataCells(col: Column) = CellCountAggregate(true, col) - - /** Computes the number of NoData cells in a column. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def aggNoDataCells(col: Column) = CellCountAggregate(false, col) - - /** Compute the Tile-wise mean */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def tileMean(col: Column): TypedColumn[Any, Double] = - withAlias("tileMean", col)( - udf[Double, Tile](F.tileMean).apply(col) - ).as[Double] - - /** Compute the Tile-wise sum */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def tileSum(col: Column): TypedColumn[Any, Double] = - withAlias("tileSum", col)( - udf[Double, Tile](F.tileSum).apply(col) - ).as[Double] - - /** Compute the minimum cell value in tile. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def tileMin(col: Column): TypedColumn[Any, Double] = - withAlias("tileMin", col)( - udf[Double, Tile](F.tileMin).apply(col) - ).as[Double] - - /** Compute the maximum cell value in tile. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def tileMax(col: Column): TypedColumn[Any, Double] = - withAlias("tileMax", col)( - udf[Double, Tile](F.tileMax).apply(col) - ).as[Double] - - /** Compute TileHistogram of Tile values. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def tileHistogram(col: Column): TypedColumn[Any, CellHistogram] = - withAlias("tileHistogram", col)( - udf[CellHistogram, Tile](F.tileHistogram).apply(col) - ).as[CellHistogram] - - /** Compute statistics of Tile values. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def tileStats(col: Column): TypedColumn[Any, CellStatistics] = - withAlias("tileStats", col)( - udf[CellStatistics, Tile](F.tileStats).apply(col) - ).as[CellStatistics] - - /** Counts the number of non-NoData cells per Tile. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. 
Please use \"snake_case\" variant instead.", "0.8.0") - def dataCells(tile: Column): TypedColumn[Any, Long] = - withAlias("dataCells", tile)( - udf(F.dataCells).apply(tile) - ).as[Long] - - /** Counts the number of NoData cells per Tile. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def noDataCells(tile: Column): TypedColumn[Any, Long] = - withAlias("noDataCells", tile)( - udf(F.noDataCells).apply(tile) - ).as[Long] - - - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def isNoDataTile(tile: Column): TypedColumn[Any, Boolean] = - withAlias("isNoDataTile", tile)( - udf(F.isNoDataTile).apply(tile) - ).as[Boolean] - - /** Compute cell-local aggregate descriptive statistics for a column of Tiles. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localAggStats(col: Column): Column = - withAlias("localAggStats", col)( - F.localAggStats(col) - ) - - /** Compute the cell-wise/local max operation between Tiles in a column. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localAggMax(col: Column): TypedColumn[Any, Tile] = - withAlias("localAggMax", col)( - F.localAggMax(col) - ).as[Tile] - - /** Compute the cellwise/local min operation between Tiles in a column. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localAggMin(col: Column): TypedColumn[Any, Tile] = - withAlias("localAggMin", col)( - F.localAggMin(col) - ).as[Tile] - - /** Compute the cellwise/local mean operation between Tiles in a column. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localAggMean(col: Column): TypedColumn[Any, Tile] = - withAlias("localAggMean", col)( - F.localAggMean(col) - ).as[Tile] - - /** Compute the cellwise/local count of non-NoData cells for all Tiles in a column. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localAggDataCells(col: Column): TypedColumn[Any, Tile] = - withAlias("localCount", col)( - F.localAggCount(col) - ).as[Tile] - - /** Compute the cellwise/local count of NoData cells for all Tiles in a column. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localAggNoDataCells(col: Column): TypedColumn[Any, Tile] = - withAlias("localNodataCount", col)( - F.localAggNodataCount(col) - ).as[Tile] - - /** Cellwise addition between two Tiles. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localAdd(left: Column, right: Column): TypedColumn[Any, Tile] = - withAlias("local_add", left, right)( - udf(F.localAdd).apply(left, right) - ).as[Tile] - - /** Cellwise addition of a scalar to a tile. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. 
Please use \"snake_case\" variant instead.", "0.8.0") - def localAddScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = { - val f = value match { - case i: Int => F.localAddScalarInt(_: Tile, i) - case d: Double => F.localAddScalar(_: Tile, d) - } - - udf(f).apply(tileCol).as(s"local_add_scalar($tileCol, $value)").as[Tile] - } - - /** Cellwise subtraction between two Tiles. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localSubtract(left: Column, right: Column): TypedColumn[Any, Tile] = - withAlias("localSubtract", left, right)( - udf(F.localSubtract).apply(left, right) - ).as[Tile] - - /** Cellwise subtraction of a scalar from a tile. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localSubtractScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = { - val f = value match { - case i: Int => F.localSubtractScalarInt(_: Tile, i) - case d: Double => F.localSubtractScalar(_: Tile, d) - } - - udf(f).apply(tileCol).as(s"localSubtractScalar($tileCol, $value)").as[Tile] - } - - /** Cellwise multiplication between two Tiles. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localMultiply(left: Column, right: Column): TypedColumn[Any, Tile] = - withAlias("localMultiply", left, right)( - udf(F.localMultiply).apply(left, right) - ).as[Tile] - - /** Cellwise multiplication of a tile by a scalar. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localMultiplyScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = { - val f = value match { - case i: Int => F.localMultiplyScalarInt(_: Tile, i) - case d: Double => F.localMultiplyScalar(_: Tile, d) - } - - udf(f).apply(tileCol).as(s"localMultiplyScalar($tileCol, $value)").as[Tile] - } - - /** Cellwise division between two Tiles. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localDivide(left: Column, right: Column): TypedColumn[Any, Tile] = - withAlias("localDivide", left, right)( - udf(F.localDivide).apply(left, right) - ).as[Tile] - - /** Cellwise division of a tile by a scalar. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localDivideScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = { - val f = value match { - case i: Int => F.localDivideScalarInt(_: Tile, i) - case d: Double => F.localDivideScalar(_: Tile, d) - } - - udf(f).apply(tileCol).as(s"localDivideScalar($tileCol, $value)").as[Tile] - } - - /** Perform an arbitrary GeoTrellis `LocalTileBinaryOp` between two Tile columns. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localAlgebra(op: LocalTileBinaryOp, left: Column, right: Column): - TypedColumn[Any, Tile] = - withAlias(opName(op), left, right)( - udf[Tile, Tile, Tile](op.apply).apply(left, right) - ).as[Tile] - - /** Compute the normalized difference of two tile columns */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. 
Please use \"snake_case\" variant instead.", "0.8.0") - def normalizedDifference(left: Column, right: Column): TypedColumn[Any, Tile] = - withAlias("normalizedDifference", left, right)( - udf(F.normalizedDifference).apply(left, right) - ).as[Tile] - - /** Constructor for constant tile column */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def makeConstantTile(value: Number, cols: Int, rows: Int, cellType: String): TypedColumn[Any, Tile] = - udf(() => F.makeConstantTile(value, cols, rows, cellType)).apply().as(s"constant_$cellType").as[Tile] - - /** Alias for column of constant tiles of zero */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def tileZeros(cols: Int, rows: Int, cellType: String = "float64"): TypedColumn[Any, Tile] = - udf(() => F.tileZeros(cols, rows, cellType)).apply().as(s"zeros_$cellType").as[Tile] - - /** Alias for column of constant tiles of one */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def tileOnes(cols: Int, rows: Int, cellType: String = "float64"): TypedColumn[Any, Tile] = - udf(() => F.tileOnes(cols, rows, cellType)).apply().as(s"ones_$cellType").as[Tile] - - /** Where the mask tile equals the mask value, replace values in the source tile with NODATA */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def maskByValue(sourceTile: Column, maskTile: Column, maskValue: Column): TypedColumn[Any, Tile] = - withAlias("maskByValue", sourceTile, maskTile, maskValue)( - udf(F.maskByValue).apply(sourceTile, maskTile, maskValue) - ).as[Tile] - - /** Where the mask tile DOES NOT contain NODATA, replace values in the source tile with NODATA */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def inverseMask(sourceTile: Column, maskTile: Column): TypedColumn[Any, Tile] = - withAlias("inverseMask", sourceTile, maskTile)( - udf(F.inverseMask).apply(sourceTile, maskTile) - ).as[Tile] - - /** Reproject a column of geometry from one CRS to another. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def reprojectGeometry(sourceGeom: Column, srcCRS: CRS, dstCRS: CRS): TypedColumn[Any, Geometry] = - withAlias("reprojectGeometry", sourceGeom)( - udf(F.reprojectGeometry(_: Geometry, srcCRS, dstCRS)).apply(sourceGeom) - ).as[Geometry] - - /** Render Tile as ASCII string for debugging purposes. */ - @Experimental - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def renderAscii(col: Column): TypedColumn[Any, String] = - withAlias("renderAscii", col)( - udf[String, Tile](F.renderAscii).apply(col) - ).as[String] - - /** Cellwise less than value comparison between two tiles. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localLess(left: Column, right: Column): TypedColumn[Any, Tile] = - withAlias("localLess", left, right)( - udf(F.localLess).apply(left, right) - ).as[Tile] - - - /** Cellwise less than value comparison between a tile and a scalar. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. 
Please use \"snake_case\" variant instead.", "0.8.0") - def localLessScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = { - val f = value match{ - case i: Int => F.localLessScalarInt(_: Tile, i) - case d: Double => F.localLessScalar(_: Tile, d) - } - udf(f).apply(tileCol).as(s"localLessScalar($tileCol, $value)").as[Tile] - } - - /** Cellwise less than or equal to value comparison between a tile and a scalar. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localLessEqual(left: Column, right: Column): TypedColumn[Any, Tile] = - withAlias("localLessEqual", left, right)( - udf(F.localLess).apply(left, right) + /** Create a row for each cell in Tile. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def explodeTiles(cols: Column*): Column = delegate.explode_tiles(cols: _*) + + /** Create a row for each cell in Tile with random sampling and optional seed. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def explodeTilesSample(sampleFraction: Double, seed: Option[Long], cols: Column*): Column = + ExplodeTiles(sampleFraction, seed, cols) + + /** Create a row for each cell in Tile with random sampling (no seed). */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def explodeTilesSample(sampleFraction: Double, cols: Column*): Column = + ExplodeTiles(sampleFraction, None, cols) + + /** Query the number of (cols, rows) in a Tile. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def tileDimensions(col: Column): Column = GetDimensions(col) + + @Experimental + /** Convert array in `arrayCol` into a Tile of dimensions `cols` and `rows`*/ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def arrayToTile(arrayCol: Column, cols: Int, rows: Int) = withAlias("array_to_tile", arrayCol)( + udf[Tile, AnyRef](F.arrayToTile(cols, rows)).apply(arrayCol) + ) + + /** Create a Tile from a column of cell data with location indexes and preform cell conversion. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def assembleTile(columnIndex: Column, rowIndex: Column, cellData: Column, tileCols: Int, tileRows: Int, ct: CellType): TypedColumn[Any, Tile] = + convertCellType(TileAssembler(columnIndex, rowIndex, cellData, lit(tileCols), lit(tileRows)), ct).as(cellData.columnName).as[Tile] + + /** Create a Tile from a column of cell data with location indexes. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def assembleTile(columnIndex: Column, rowIndex: Column, cellData: Column, tileCols: Column, tileRows: Column): TypedColumn[Any, Tile] = + TileAssembler(columnIndex, rowIndex, cellData, tileCols, tileRows) + + /** Extract the Tile's cell type */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def cellType(col: Column): TypedColumn[Any, CellType] = GetCellType(col) + + /** Change the Tile's cell type */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. 
Please use \"snake_case\" variant instead.", "0.8.0") + def convertCellType(col: Column, cellType: CellType): TypedColumn[Any, Tile] = + SetCellType(col, cellType) + + /** Change the Tile's cell type */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def convertCellType(col: Column, cellTypeName: String): TypedColumn[Any, Tile] = + SetCellType(col, cellTypeName) + + /** Convert a bounding box structure to a Geometry type. Intented to support multiple schemas. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def boundsGeometry(bounds: Column): TypedColumn[Any, Geometry] = BoundsToGeometry(bounds) + + /** Assign a `NoData` value to the Tiles. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def withNoData(col: Column, nodata: Double) = withAlias("withNoData", col)( + udf[Tile, Tile](F.withNoData(nodata)).apply(col) ).as[Tile] - /** Cellwise less than or equal to value comparison between a tile and a scalar. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localLessEqualScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = { - val f = value match{ - case i: Int => F.localLessEqualScalarInt(_: Tile, i) - case d: Double => F.localLessEqualScalar(_: Tile, d) - } - udf(f).apply(tileCol).as(s"localLessEqualScalar($tileCol, $value)").as[Tile] - } - - /** Cellwise greater than value comparison between two tiles. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localGreater(left: Column, right: Column): TypedColumn[Any, Tile] = - withAlias("localGreater", left, right)( - udf(F.localGreater).apply(left, right) - ).as[Tile] + /** Compute the full column aggregate floating point histogram. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def aggHistogram(col: Column): TypedColumn[Any, CellHistogram] = delegate.agg_approx_histogram(col) + /** Compute the full column aggregate floating point statistics. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def aggStats(col: Column): TypedColumn[Any, CellStatistics] = delegate.agg_stats(col) - /** Cellwise greater than value comparison between a tile and a scalar. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localGreaterScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = { - val f = value match{ - case i: Int => F.localGreaterScalarInt(_: Tile, i) - case d: Double => F.localGreaterScalar(_: Tile, d) - } - udf(f).apply(tileCol).as(s"localGreaterScalar($tileCol, $value)").as[Tile] + /** Computes the column aggregate mean. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def aggMean(col: Column) = CellMeanAggregate(col) + + /** Computes the number of non-NoData cells in a column. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. 
Please use \"snake_case\" variant instead.", "0.8.0") + def aggDataCells(col: Column): TypedColumn[Any, Long] = delegate.agg_data_cells(col) + + /** Computes the number of NoData cells in a column. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def aggNoDataCells(col: Column): TypedColumn[Any, Long] = delegate.agg_no_data_cells(col) + + /** Compute the Tile-wise mean */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def tileMean(col: Column): TypedColumn[Any, Double] = delegate.tile_mean(col) + + /** Compute the Tile-wise sum */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def tileSum(col: Column): TypedColumn[Any, Double] = delegate.tile_sum(col) + + /** Compute the minimum cell value in tile. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def tileMin(col: Column): TypedColumn[Any, Double] = delegate.tile_min(col) + + /** Compute the maximum cell value in tile. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def tileMax(col: Column): TypedColumn[Any, Double] = delegate.tile_max(col) + + /** Compute TileHistogram of Tile values. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def tileHistogram(col: Column): TypedColumn[Any, CellHistogram] = delegate.tile_histogram(col) + + /** Compute statistics of Tile values. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def tileStats(col: Column): TypedColumn[Any, CellStatistics] = delegate.tile_stats(col) + + /** Counts the number of non-NoData cells per Tile. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def dataCells(tile: Column): TypedColumn[Any, Long] = delegate.data_cells(tile) + + /** Counts the number of NoData cells per Tile. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def noDataCells(tile: Column): TypedColumn[Any, Long] = delegate.no_data_cells(tile) + + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def isNoDataTile(tile: Column): TypedColumn[Any, Boolean] = delegate.is_no_data_tile(tile) + + /** Compute cell-local aggregate descriptive statistics for a column of Tiles. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localAggStats(col: Column): Column = delegate.agg_local_stats(col) + + /** Compute the cell-wise/local max operation between Tiles in a column. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localAggMax(col: Column): TypedColumn[Any, Tile] = delegate.agg_local_max(col) + + /** Compute the cellwise/local min operation between Tiles in a column. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. 
Please use \"snake_case\" variant instead.", "0.8.0") + def localAggMin(col: Column): TypedColumn[Any, Tile] = delegate.agg_local_min(col) + + /** Compute the cellwise/local mean operation between Tiles in a column. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localAggMean(col: Column): TypedColumn[Any, Tile] = delegate.agg_local_mean(col) + + /** Compute the cellwise/local count of non-NoData cells for all Tiles in a column. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localAggDataCells(col: Column): TypedColumn[Any, Tile] = delegate.agg_local_data_cells(col) + + /** Compute the cellwise/local count of NoData cells for all Tiles in a column. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localAggNoDataCells(col: Column): TypedColumn[Any, Tile] = delegate.agg_local_no_data_cells(col) + + /** Cellwise addition between two Tiles. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localAdd(left: Column, right: Column): Column = delegate.local_add(left, right) + + /** Cellwise addition of a scalar to a tile. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localAddScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = delegate.local_add(tileCol, value) + + /** Cellwise subtraction between two Tiles. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localSubtract(left: Column, right: Column): Column = delegate.local_subtract(left, right) + + /** Cellwise subtraction of a scalar from a tile. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localSubtractScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = delegate.local_subtract(tileCol, value) + /** Cellwise multiplication between two Tiles. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localMultiply(left: Column, right: Column): Column = delegate.local_multiply(left, right) + + /** Cellwise multiplication of a tile by a scalar. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localMultiplyScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = delegate.local_multiply(tileCol, value) + + /** Cellwise division between two Tiles. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localDivide(left: Column, right: Column): Column = delegate.local_divide(left, right) + + /** Cellwise division of a tile by a scalar. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localDivideScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = delegate.local_divide(tileCol, value) + /** Perform an arbitrary GeoTrellis `LocalTileBinaryOp` between two Tile columns. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. 
Please use \"snake_case\" variant instead.", "0.8.0") + def localAlgebra(op: LocalTileBinaryOp, left: Column, right: Column): + TypedColumn[Any, Tile] = + withAlias(opName(op), left, right)( + udf[Tile, Tile, Tile](op.apply).apply(left, right) + ).as[Tile] + + /** Compute the normalized difference of two tile columns */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def normalizedDifference(left: Column, right: Column): TypedColumn[Any, Tile] = delegate.normalized_difference(left, right) + + /** Constructor for constant tile column */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def makeConstantTile(value: Number, cols: Int, rows: Int, cellType: String): TypedColumn[Any, Tile] = + udf(() => F.makeConstantTile(value, cols, rows, cellType)).apply().as(s"constant_$cellType").as[Tile] + + /** Alias for column of constant tiles of zero */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def tileZeros(cols: Int, rows: Int, cellType: String = "float64"): TypedColumn[Any, Tile] = + udf(() => F.tileZeros(cols, rows, cellType)).apply().as(s"zeros_$cellType").as[Tile] + + /** Alias for column of constant tiles of one */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def tileOnes(cols: Int, rows: Int, cellType: String = "float64"): TypedColumn[Any, Tile] = + udf(() => F.tileOnes(cols, rows, cellType)).apply().as(s"ones_$cellType").as[Tile] + + /** Where the mask tile equals the mask value, replace values in the source tile with NODATA */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def maskByValue(sourceTile: Column, maskTile: Column, maskValue: Column): TypedColumn[Any, Tile] = + delegate.mask_by_value(sourceTile, maskTile, maskValue) + + /** Where the mask tile DOES NOT contain NODATA, replace values in the source tile with NODATA */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def inverseMask(sourceTile: Column, maskTile: Column): TypedColumn[Any, Tile] = + delegate.inverse_mask(sourceTile, maskTile) + + /** Reproject a column of geometry from one CRS to another. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def reprojectGeometry(sourceGeom: Column, srcCRS: CRS, dstCRS: CRS): TypedColumn[Any, Geometry] = + delegate.reproject_geometry(sourceGeom, srcCRS, dstCRS) + + /** Render Tile as ASCII string for debugging purposes. */ + @Experimental + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def renderAscii(col: Column): TypedColumn[Any, String] = delegate.render_ascii(col) + + /** Cellwise less than value comparison between two tiles. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localLess(left: Column, right: Column): TypedColumn[Any, Tile] = + delegate.local_less(left, right) + + + /** Cellwise less than value comparison between a tile and a scalar. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. 
Please use \"snake_case\" variant instead.", "0.8.0") + def localLessScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = delegate.local_less(tileCol, value) + + /** Cellwise less than or equal to value comparison between a tile and a scalar. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localLessEqual(left: Column, right: Column): TypedColumn[Any, Tile] = delegate.local_less_equal(left, right) + + /** Cellwise less than or equal to value comparison between a tile and a scalar. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localLessEqualScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = delegate.local_less_equal(tileCol, value) + + /** Cellwise greater than value comparison between two tiles. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localGreater(left: Column, right: Column): TypedColumn[Any, Tile] = + delegate.local_greater(left, right) + + /** Cellwise greater than value comparison between a tile and a scalar. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localGreaterScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = delegate.local_greater(tileCol, value) + + /** Cellwise greater than or equal to value comparison between two tiles. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localGreaterEqual(left: Column, right: Column): TypedColumn[Any, Tile] = delegate.local_greater_equal(left, right) + + /** Cellwise greater than or equal to value comparison between a tile and a scalar. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localGreaterEqualScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = delegate.local_greater_equal(tileCol, value) + + /** Cellwise equal to value comparison between two tiles. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localEqual(left: Column, right: Column): TypedColumn[Any, Tile] = delegate.local_equal(left, right) + + /** Cellwise equal to value comparison between a tile and a scalar. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localEqualScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = delegate.local_equal(tileCol, value) + + /** Cellwise inequality comparison between two tiles. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localUnequal(left: Column, right: Column): TypedColumn[Any, Tile] = delegate.local_unequal(left, right) + + /** Cellwise inequality comparison between a tile and a scalar. */ + @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") + def localUnequalScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = delegate.local_unequal(tileCol, value) } - /** Cellwise greater than or equal to value comparison between two tiles. 
*/ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localGreaterEqual(left: Column, right: Column): TypedColumn[Any, Tile] = - withAlias("localGreaterEqual", left, right)( - udf(F.localGreaterEqual).apply(left, right) - ).as[Tile] - - /** Cellwise greater than or equal to value comparison between a tile and a scalar. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localGreaterEqualScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = { - val f = value match{ - case i: Int => F.localGreaterEqualScalarInt(_: Tile, i) - case d: Double => F.localGreaterEqualScalar(_: Tile, d) - } - udf(f).apply(tileCol).as(s"localGreaterEqualScalar($tileCol, $value)").as[Tile] - } - - /** Cellwise equal to value comparison between two tiles. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localEqual(left: Column, right: Column): TypedColumn[Any, Tile] = - withAlias("localEqual", left, right)( - udf(F.localEqual).apply(left, right) - ).as[Tile] - - /** Cellwise equal to value comparison between a tile and a scalar. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localEqualScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = { - val f = value match{ - case i: Int => F.localEqualScalarInt(_: Tile, i) - case d: Double => F.localEqualScalar(_: Tile, d) - } - udf(f).apply(tileCol).as(s"localEqualScalar($tileCol, $value)").as[Tile] - } - /** Cellwise inequality comparison between two tiles. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localUnequal(left: Column, right: Column): TypedColumn[Any, Tile] = - withAlias("localUnequal", left, right)( - udf(F.localUnequal).apply(left, right) - ).as[Tile] - - /** Cellwise inequality comparison between a tile and a scalar. */ - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localUnequalScalar[T: Numeric](tileCol: Column, value: T): TypedColumn[Any, Tile] = { - val f = value match{ - case i: Int => F.localUnequalScalarInt(_: Tile, i) - case d: Double => F.localUnequalScalar(_: Tile, d) - } - udf(f).apply(tileCol).as(s"localUnequalScalar($tileCol, $value)").as[Tile] - } -} - @deprecated("Part of 0.7.x compatility kit, to be removed after 0.8.x.", "0.8.0") def register(sqlContext: SQLContext): Unit = { /** Unary expression builder builder. */ - def ub[A, B](f: A ⇒ B)(a: Seq[A]): B = f(a.head) + def ub[A, B](f: A => B)(a: Seq[A]): B = f(a.head) /** Binary expression builder builder. */ - def bb[A, B](f: (A, A) ⇒ B)(a: Seq[A]): B = f(a.head, a.last) + def bb[A, B](f: (A, A) => B)(a: Seq[A]): B = f(a.head, a.last) // Expression-oriented functions have a different registration scheme // Currently have to register with the `builtin` registry due to Spark data hiding. 
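For reference, `ub` and `bb` above adapt plain expression constructors to the `Seq[Expression] => Expression` shape that function registration expects. A minimal sketch using two of the constructors registered just below (assumes the same imports already in scope in this file):

    import org.apache.spark.sql.catalyst.expressions.Expression

    // Seq(child) becomes GetCellType(child)
    val unary: Seq[Expression] => Expression = ub(GetCellType.apply)
    // Seq(left, right) becomes Add(left, right)
    val binary: Seq[Expression] => Expression = bb(Add.apply)
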
val registry: FunctionRegistry = rf.registry(sqlContext) - VersionShims.registerExpression(registry, "rf_explodeTiles", E.ExplodeTiles.apply(1.0, None, _)) - VersionShims.registerExpression(registry, "rf_cellType", ub(E.GetCellType.apply)) - VersionShims.registerExpression(registry, "rf_convertCellType", bb(E.SetCellType.apply)) - VersionShims.registerExpression(registry, "rf_tileDimensions", ub(E.GetDimensions.apply)) - VersionShims.registerExpression(registry, "rf_boundsGeometry", ub(E.BoundsToGeometry.apply)) - + registry.registerFunc("rf_explodeTiles", ExplodeTiles.apply(1.0, None, _)) + registry.registerFunc("rf_cellType", ub(GetCellType.apply)) + registry.registerFunc("rf_convertCellType", bb(SetCellType.apply)) + registry.registerFunc("rf_tileDimensions", ub(GetDimensions.apply)) + registry.registerFunc("rf_boundsGeometry", ub(BoundsToGeometry.apply)) + registry.registerFunc("rf_localAdd", bb(Add.apply)) + registry.registerFunc("rf_localSubtract", bb(Subtract.apply)) + registry.registerFunc("rf_localMultiply", bb(Multiply.apply)) + registry.registerFunc("rf_localDivide", bb(Divide.apply)) + registry.registerFunc("rf_normalizedDifference", bb(NormalizedDifference.apply)) + registry.registerFunc("rf_localLess", bb(Less.apply)) + registry.registerFunc("rf_localLessEqual", bb(LessEqual.apply)) + registry.registerFunc("rf_localGreater", bb(Greater.apply)) + registry.registerFunc("rf_localGreaterEqual", bb(GreaterEqual.apply)) + registry.registerFunc("rf_localEqual", bb(Equal.apply)) + registry.registerFunc("rf_localUnequal", bb(Unequal.apply)) + registry.registerFunc("rf_tileSum", ub(Sum.apply)) + registry.registerFunc("rf_dataCells", ub(DataCells.apply)) + registry.registerFunc("rf_noDataCells", ub(NoDataCells.apply)) + registry.registerFunc("rf_isNoDataTile", ub(IsNoDataTile.apply)) + registry.registerFunc("rf_tileMin", ub(TileMin.apply)) + registry.registerFunc("rf_tileMax", ub(TileMax.apply)) + registry.registerFunc("rf_tileMean", ub(TileMean.apply)) + registry.registerFunc("rf_tileStats", ub(TileStats.apply)) + registry.registerFunc("rf_tileHistogram", ub(TileHistogram.apply)) + registry.registerFunc("rf_aggStats", ub(CellStatsAggregate.CellStatsAggregateUDAF.apply)) + registry.registerFunc("rf_aggHistogram", ub(HistogramAggregate.HistogramAggregateUDAF.apply)) + registry.registerFunc("rf_localAggStats", ub(LocalStatsAggregate.LocalStatsAggregateUDAF.apply)) + registry.registerFunc("rf_renderAscii", ub(DebugRender.RenderMatrix.apply)) + registry.registerFunc("rf_localAggMax", ub(LocalTileOpAggregate.LocalMaxUDAF.apply)) + registry.registerFunc("rf_localAggMin", ub(LocalTileOpAggregate.LocalMinUDAF.apply)) + registry.registerFunc("rf_localAggCount", ub(LocalCountAggregate.LocalDataCellsUDAF.apply)) + registry.registerFunc("rf_localAggMean", ub(LocalMeanAggregate.apply)) - sqlContext.udf.register("rf_maskByValue", F.maskByValue) - sqlContext.udf.register("rf_inverseMask", F.inverseMask) sqlContext.udf.register("rf_makeConstantTile", F.makeConstantTile) sqlContext.udf.register("rf_tileZeros", F.tileZeros) sqlContext.udf.register("rf_tileOnes", F.tileOnes) - sqlContext.udf.register("rf_tileToArrayInt", F.tileToArray[Int]) - sqlContext.udf.register("rf_tileToArrayDouble", F.tileToArray[Double]) - sqlContext.udf.register("rf_aggHistogram", F.aggHistogram) - sqlContext.udf.register("rf_aggStats", F.aggStats) - sqlContext.udf.register("rf_tileMin", F.tileMin) - sqlContext.udf.register("rf_tileMax", F.tileMax) - sqlContext.udf.register("rf_tileMean", F.tileMean) - 
sqlContext.udf.register("rf_tileSum", F.tileSum) - sqlContext.udf.register("rf_tileHistogram", F.tileHistogram) - sqlContext.udf.register("rf_tileStats", F.tileStats) - sqlContext.udf.register("rf_dataCells", F.dataCells) - sqlContext.udf.register("rf_noDataCells", F.noDataCells) - sqlContext.udf.register("rf_isNoDataTile", F.isNoDataTile) - sqlContext.udf.register("rf_localAggStats", F.localAggStats) - sqlContext.udf.register("rf_localAggMax", F.localAggMax) - sqlContext.udf.register("rf_localAggMin", F.localAggMin) - sqlContext.udf.register("rf_localAggMean", F.localAggMean) - sqlContext.udf.register("rf_localAggCount", F.localAggCount) - sqlContext.udf.register("rf_localAdd", F.localAdd) - sqlContext.udf.register("rf_localAddScalar", F.localAddScalar) - sqlContext.udf.register("rf_localAddScalarInt", F.localAddScalarInt) - sqlContext.udf.register("rf_localSubtract", F.localSubtract) - sqlContext.udf.register("rf_localSubtractScalar", F.localSubtractScalar) - sqlContext.udf.register("rf_localSubtractScalarInt", F.localSubtractScalarInt) - sqlContext.udf.register("rf_localMultiply", F.localMultiply) - sqlContext.udf.register("rf_localMultiplyScalar", F.localMultiplyScalar) - sqlContext.udf.register("rf_localMultiplyScalarInt", F.localMultiplyScalarInt) - sqlContext.udf.register("rf_localDivide", F.localDivide) - sqlContext.udf.register("rf_localDivideScalar", F.localDivideScalar) - sqlContext.udf.register("rf_localDivideScalarInt", F.localDivideScalarInt) - sqlContext.udf.register("rf_normalizedDifference", F.normalizedDifference) sqlContext.udf.register("rf_cellTypes", F.cellTypes) - sqlContext.udf.register("rf_renderAscii", F.renderAscii) - sqlContext.udf.register("rf_lessScalar", F.localLessScalar) - sqlContext.udf.register("rf_lessScalarInt", F.localLessScalarInt) - sqlContext.udf.register("rf_lessEqual", F.localLessEqual) - sqlContext.udf.register("rf_lessEqualScalar", F.localLessEqualScalar) - sqlContext.udf.register("rf_lessEqualScalarInt", F.localLessEqualScalarInt) - sqlContext.udf.register("rf_greater", F.localGreater) - sqlContext.udf.register("rf_greaterScalar", F.localGreaterScalar) - sqlContext.udf.register("rf_greaterScalarInt", F.localGreaterScalarInt) - sqlContext.udf.register("rf_greaterEqual", F.localGreaterEqual) - sqlContext.udf.register("rf_greaterEqualScalar", F.localGreaterEqualScalar) - sqlContext.udf.register("rf_greaterEqualScalarInt", F.localGreaterEqualScalarInt) - sqlContext.udf.register("rf_equal", F.localEqual) - sqlContext.udf.register("rf_equalScalar", F.localEqualScalar) - sqlContext.udf.register("rf_equalScalarInt", F.localEqualScalarInt) - sqlContext.udf.register("rf_unequal", F.localUnequal) - sqlContext.udf.register("rf_unequalScalar", F.localUnequalScalar) - sqlContext.udf.register("rf_unequalScalarInt", F.localUnequalScalarInt) sqlContext.udf.register("rf_reprojectGeometry", F.reprojectGeometryCRSName) } } diff --git a/core/src/main/scala/astraea/spark/rasterframes/util/package.scala b/core/src/main/scala/astraea/spark/rasterframes/util/package.scala index 7713a0760..02a365cea 100644 --- a/core/src/main/scala/astraea/spark/rasterframes/util/package.scala +++ b/core/src/main/scala/astraea/spark/rasterframes/util/package.scala @@ -20,7 +20,8 @@ package astraea.spark.rasterframes import geotrellis.proj4.CRS -import geotrellis.raster.CellGrid +import geotrellis.raster +import geotrellis.raster.{CellGrid, Tile, isNoData} import geotrellis.raster.crop.TileCropMethods import geotrellis.raster.io.geotiff.reader.GeoTiffReader import 
geotrellis.raster.mapalgebra.local.LocalTileBinaryOp
@@ -36,6 +37,7 @@ import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.rf._
 import org.apache.spark.sql.types.StringType
 import org.apache.spark.sql.{Column, DataFrame, SQLContext}
+import spire.syntax.cfor._
 import scala.Boolean.box
@@ -89,9 +91,10 @@ package object util extends LazyLogging {
   object CRSParser {
     def apply(value: String): CRS = {
       value match {
-        case e if e.startsWith("EPSG") => CRS.fromName(e)
-        case p if p.startsWith("+proj") => CRS.fromString(p)
-        case w if w.startsWith("GEOGCS") => CRS.fromWKT(w)
+        case e if e.toUpperCase().startsWith("EPSG") => CRS.fromName(e) // not case-sensitive
+        case p if p.startsWith("+proj") => CRS.fromString(p) // case-sensitive
+        case w if w.toUpperCase().startsWith("GEOGCS") => CRS.fromWKT(w) // only case-sensitive inside double quotes
+        case _ => throw new IllegalArgumentException("CRS string must be an EPSG code, a +proj string, or OGC WKT")
       }
     }
   }
@@ -153,6 +156,31 @@ package object util extends LazyLogging {
     analyzer(sqlContext).extendedResolutionRules
   }

+  implicit class TileAsMatrix(val tile: Tile) extends AnyVal {
+    def renderMatrix(significantDigits: Int): String = {
+      val ND = s"%${significantDigits+5}s".format(Double.NaN)
+      val fmt = s"% ${significantDigits+5}.${significantDigits}g"
+      val buf = new StringBuilder("[")
+      cfor(0)(_ < tile.rows, _ + 1) { row =>
+        if(row > 0) buf.append(' ')
+        buf.append('[')
+        cfor(0)(_ < tile.cols, _ + 1) { col =>
+          val v = tile.getDouble(col, row)
+          if (isNoData(v)) buf.append(ND)
+          else buf.append(fmt.format(v))
+
+          if (col < tile.cols - 1)
+            buf.append(',')
+        }
+        buf.append(']')
+        if (row < tile.rows - 1)
+          buf.append(",\n")
+      }
+      buf.append("]")
+      buf.toString()
+    }
+  }
+
   object Shims {
     // GT 1.2.1 to 2.0.0
     def toArrayTile[T <: CellGrid](tile: T): T =
diff --git a/core/src/main/scala/org/apache/spark/sql/rf/TileUDT.scala b/core/src/main/scala/org/apache/spark/sql/rf/TileUDT.scala
index 6b95d359e..75ac0f7cf 100644
--- a/core/src/main/scala/org/apache/spark/sql/rf/TileUDT.scala
+++ b/core/src/main/scala/org/apache/spark/sql/rf/TileUDT.scala
@@ -23,11 +23,9 @@ package org.apache.spark.sql.rf
 import astraea.spark.rasterframes.encoders.CatalystSerializer
 import astraea.spark.rasterframes.encoders.CatalystSerializer._
-import astraea.spark.rasterframes.ref.RasterRef
-import astraea.spark.rasterframes.ref.RasterRef.RasterRefTile
-import astraea.spark.rasterframes.tiles.{InternalRowTile, ProjectedRasterTile}
+import astraea.spark.rasterframes.model.{Cells, TileDataContext}
+import astraea.spark.rasterframes.tiles.InternalRowTile
 import geotrellis.raster._
-import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.types.{DataType, _}
@@ -72,68 +70,30 @@ class TileUDT extends UserDefinedType[Tile] {
 case object TileUDT {
   UDTRegistration.register(classOf[Tile].getName, classOf[TileUDT].getName)
-  UDTRegistration.register(classOf[ProjectedRasterTile].getName, classOf[TileUDT].getName)

   final val typeName: String = "tile"

-  // Column mapping which must match layout below
-  object C {
-    val CELL_TYPE = 0
-    val COLS = 1
-    val ROWS = 2
-    val CELLS = 3
-    val REF = 4
-  }
-
   implicit def tileSerializer: CatalystSerializer[Tile] = new CatalystSerializer[Tile] {
     import scala.language.reflectiveCalls

     override def schema: StructType = StructType(Seq(
-      StructField("cell_type", StringType, false),
-      StructField("cols", ShortType, false),
-      StructField("rows", ShortType, false),
-      StructField("cells",
BinaryType, true), - StructField("ref", CatalystSerializer[RasterRef].schema, true) + StructField("cell_context", CatalystSerializer[TileDataContext].schema, false), + StructField("cell_data", CatalystSerializer[Cells].schema, false) )) - def isRef[R](row: R, io: CatalystIO[R]): Boolean = io.isNullAt(row, C.CELLS) - - override def to[R](t: Tile, io: CatalystIO[R]): R = { - t match { - case ref: RasterRefTile ⇒ - io.create( - io.encode(ref.cellType.name), - ref.cols.toShort, - ref.rows.toShort, - null, - io.to(ref.rr) - ) - case _ ⇒ - io.create( - io.encode(t.cellType.name), - t.cols.toShort, - t.rows.toShort, - t.toBytes, - null - ) - } - } + override def to[R](t: Tile, io: CatalystIO[R]): R = io.create( + io.to(TileDataContext(t)), + io.to(Cells(t)) + ) + override def from[R](row: R, io: CatalystIO[R]): Tile = { + val cells = io.get[Cells](row, 1) + row match { - case ir: InternalRow if !isRef(row, io) ⇒ new InternalRowTile(ir) + case ir: InternalRow if !cells.isRef ⇒ new InternalRowTile(ir) case _ ⇒ - if(isRef(row, io)) { - val ref = io.get[RasterRef](row, C.REF) - RasterRefTile(ref) - } - else { - val ct = CellType.fromName(io.getString(row, C.CELL_TYPE)) - val cols = io.getShort(row, C.COLS) - val rows = io.getShort(row, C.ROWS) - - val data = io.getByteArray(row, 3) - ArrayTile.fromBytes(data, ct, cols, rows) - } + val ctx = io.get[TileDataContext](row, 0) + cells.toTile(ctx) } } } diff --git a/core/src/main/scala/org/apache/spark/sql/rf/VersionShims.scala b/core/src/main/scala/org/apache/spark/sql/rf/VersionShims.scala index 3a05af0a4..b9eb96981 100644 --- a/core/src/main/scala/org/apache/spark/sql/rf/VersionShims.scala +++ b/core/src/main/scala/org/apache/spark/sql/rf/VersionShims.scala @@ -2,16 +2,21 @@ package org.apache.spark.sql.rf import java.lang.reflect.{Constructor, Method} +import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.analysis.FunctionRegistry +import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.{FunctionBuilder, expressionInfo} import org.apache.spark.sql.catalyst.catalog.CatalogTable -import org.apache.spark.sql.{DataFrame, Dataset, SQLContext} -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, ScalaUDF} +import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset, SQLContext} +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, BinaryExpression, Expression, ExpressionDescription, ExpressionInfo, RuntimeReplaceable, ScalaUDF} import org.apache.spark.sql.catalyst.expressions.objects.{Invoke, InvokeLike} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types.DataType +import scala.reflect._ +import scala.util.{Failure, Success, Try} + /** * Collection of Spark version compatibility adapters. 
* @@ -102,24 +107,91 @@ object VersionShims { } } - def registerExpression(registry: FunctionRegistry, name: String, builder: FunctionRegistry.FunctionBuilder): Unit = { - // Spark 2.3 introduced a new way of specifying Functions - val spark23FI = "org.apache.spark.sql.catalyst.FunctionIdentifier" - registry.getClass.getDeclaredMethods - .filter(m ⇒ m.getName == "registerFunction" && m.getParameterCount == 2) - .foreach { m ⇒ - val firstParam = m.getParameterTypes()(0) - if(firstParam == classOf[String]) - m.invoke(registry, name, builder) - else if(firstParam.getName == spark23FI) { - val fic = Class.forName(spark23FI) - val ctor = fic.getConstructor(classOf[String], classOf[Option[_]]) - val fi = ctor.newInstance(name, None).asInstanceOf[Object] - m.invoke(registry, fi, builder) + implicit class RichFunctionRegistry(registry: FunctionRegistry) { + + def registerFunc(name: String, builder: FunctionRegistry.FunctionBuilder): Unit = { + // Spark 2.3 introduced a new way of specifying Functions + val spark23FI = "org.apache.spark.sql.catalyst.FunctionIdentifier" + registry.getClass.getDeclaredMethods + .filter(m ⇒ m.getName == "registerFunction" && m.getParameterCount == 2) + .foreach { m ⇒ + val firstParam = m.getParameterTypes()(0) + if(firstParam == classOf[String]) + m.invoke(registry, name, builder) + else if(firstParam.getName == spark23FI) { + val fic = Class.forName(spark23FI) + val ctor = fic.getConstructor(classOf[String], classOf[Option[_]]) + val fi = ctor.newInstance(name, None).asInstanceOf[Object] + m.invoke(registry, fi, builder) + } + else { + throw new NotImplementedError("Unexpected FunctionRegistry API: " + m.toGenericString) + } } - else { - throw new NotImplementedError("Unexpected FunctionRegistry API: " + m.toGenericString) + } + + // Much of the code herein is copied from org.apache.spark.sql.catalyst.analysis.FunctionRegistry + def registerExpression[T <: Expression: ClassTag](name: String): Unit = { + val clazz = classTag[T].runtimeClass + + def expressionInfo: ExpressionInfo = { + val df = clazz.getAnnotation(classOf[ExpressionDescription]) + if (df != null) { + if (df.extended().isEmpty) { + new ExpressionInfo(clazz.getCanonicalName, null, name, df.usage(), df.arguments(), df.examples(), df.note(), df.since()) + } else { + // This exists for the backward compatibility with old `ExpressionDescription`s defining + // the extended description in `extended()`. + new ExpressionInfo(clazz.getCanonicalName, null, name, df.usage(), df.extended()) + } + } else { + new ExpressionInfo(clazz.getCanonicalName, name) } } + def findBuilder: FunctionBuilder = { + val constructors = clazz.getConstructors + // See if we can find a constructor that accepts Seq[Expression] + val varargCtor = constructors.find(_.getParameterTypes.toSeq == Seq(classOf[Seq[_]])) + val builder = (expressions: Seq[Expression]) => { + if (varargCtor.isDefined) { + // If there is an apply method that accepts Seq[Expression], use that one. + Try(varargCtor.get.newInstance(expressions).asInstanceOf[Expression]) match { + case Success(e) => e + case Failure(e) => + // the exception is an invocation exception. To get a meaningful message, we need the + // cause. + throw new AnalysisException(e.getCause.getMessage) + } + } else { + // Otherwise, find a constructor method that matches the number of arguments, and use that. 
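The fallback branch that follows mirrors Spark's own FunctionRegistry trick: given n argument expressions, find a constructor whose parameter list is exactly n copies of `Expression`. Reduced to its core (illustration only, not part of the changeset):

    import org.apache.spark.sql.catalyst.expressions.Expression

    // True when the constructor can be bound to n Expression arguments,
    // matching the Seq.fill(expressions.size)(classOf[Expression]) check below.
    def matchesArity(ctor: java.lang.reflect.Constructor[_], n: Int): Boolean =
      ctor.getParameterTypes.toSeq == Seq.fill(n)(classOf[Expression])
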
+ val params = Seq.fill(expressions.size)(classOf[Expression]) + val f = constructors.find(_.getParameterTypes.toSeq == params).getOrElse { + val validParametersCount = constructors + .filter(_.getParameterTypes.forall(_ == classOf[Expression])) + .map(_.getParameterCount).distinct.sorted + val expectedNumberOfParameters = if (validParametersCount.length == 1) { + validParametersCount.head.toString + } else { + validParametersCount.init.mkString("one of ", ", ", " and ") + + validParametersCount.last + } + throw new AnalysisException(s"Invalid number of arguments for function ${clazz.getSimpleName}. " + + s"Expected: $expectedNumberOfParameters; Found: ${params.length}") + } + Try(f.newInstance(expressions : _*).asInstanceOf[Expression]) match { + case Success(e) => e + case Failure(e) => + // the exception is an invocation exception. To get a meaningful message, we need the + // cause. + throw new AnalysisException(e.getCause.getMessage) + } + } + } + + builder + } + + registry.registerFunction(FunctionIdentifier(name), expressionInfo, findBuilder) + } } } diff --git a/core/src/test/resources/log4j.properties b/core/src/test/resources/log4j.properties index 6d7d28723..378ae8e61 100644 --- a/core/src/test/resources/log4j.properties +++ b/core/src/test/resources/log4j.properties @@ -28,7 +28,7 @@ log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: log4j.logger.org.apache.spark.repl.Main=WARN -log4j.logger.org.apache=WARN +log4j.logger.org.apache=ERROR log4j.logger.com.amazonaws=WARN log4j.logger.geotrellis=INFO diff --git a/core/src/test/scala/astraea/spark/rasterframes/ExplodeSpec.scala b/core/src/test/scala/astraea/spark/rasterframes/ExplodeSpec.scala index 2f50fc2f9..a06b6444b 100644 --- a/core/src/test/scala/astraea/spark/rasterframes/ExplodeSpec.scala +++ b/core/src/test/scala/astraea/spark/rasterframes/ExplodeSpec.scala @@ -81,7 +81,7 @@ class ExplodeSpec extends TestEnvironment with TestData { .select($"tile".as[Double]) .collect() - assert(cells.count(_.isNaN) === 1) + cells.count(_.isNaN) should be(1) } it("should handle user-defined NoData values in tile sampler") { @@ -90,7 +90,7 @@ class ExplodeSpec extends TestEnvironment with TestData { .select(explode_tiles($"tile")) .select($"tile".as[Double]) .collect() - assert(cells.count(_.isNaN) === tiles.size) + cells.count(_.isNaN) should be(tiles.size) } it("should convert tile into array") { @@ -99,18 +99,18 @@ class ExplodeSpec extends TestEnvironment with TestData { | rf_make_constant_tile(1, 10, 10, 'int8raw') |) as intArray |""".stripMargin) - assert(query.as[Array[Int]].first.sum === 100) + query.as[Array[Int]].first.sum should be (100) val tile = FloatConstantTile(1.1f, 10, 10, FloatCellType) val df = Seq[Tile](tile).toDF("tile") - val arrayDF = df.select(tile_to_array[Float]($"tile").as[Array[Float]]) - assert(arrayDF.first().sum === 110.0f +- 0.0001f) + val arrayDF = df.select(tile_to_array_double($"tile").as[Array[Double]]) + arrayDF.first().sum should be (110.0 +- 0.0001) } it("should convert an array into a tile") { val tile = FloatConstantTile(1.1f, 10, 10, FloatCellType) val df = Seq[Tile](tile, null).toDF("tile") - val arrayDF = df.withColumn("tileArray", tile_to_array[Float]($"tile")) + val arrayDF = df.withColumn("tileArray", tile_to_array_double($"tile")) val back = arrayDF.withColumn("backToTile", array_to_tile($"tileArray", 10, 10)) diff --git a/core/src/test/scala/astraea/spark/rasterframes/RasterFunctionsSpec.scala 
b/core/src/test/scala/astraea/spark/rasterframes/RasterFunctionsSpec.scala new file mode 100644 index 000000000..da2ab9c56 --- /dev/null +++ b/core/src/test/scala/astraea/spark/rasterframes/RasterFunctionsSpec.scala @@ -0,0 +1,695 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package astraea.spark.rasterframes +import astraea.spark.rasterframes.TestData.injectND +import astraea.spark.rasterframes.expressions.accessors.ExtractTile +import astraea.spark.rasterframes.stats.{CellHistogram, CellStatistics, LocalCellStatistics} +import astraea.spark.rasterframes.tiles.ProjectedRasterTile +import geotrellis.proj4.LatLng +import geotrellis.raster +import geotrellis.raster.testkit.RasterMatchers +import geotrellis.raster.{ArrayTile, BitCellType, ByteUserDefinedNoDataCellType, DoubleConstantNoDataCellType, ShortConstantNoDataCellType, Tile, UByteConstantNoDataCellType} +import geotrellis.vector.Extent +import org.apache.spark.sql.{AnalysisException, Encoders} +import org.apache.spark.sql.functions._ +import org.scalatest.{FunSpec, Matchers} + +class RasterFunctionsSpec extends FunSpec + with TestEnvironment with Matchers with RasterMatchers { + import spark.implicits._ + + val extent = Extent(10, 20, 30, 40) + val crs = LatLng + val ct = ByteUserDefinedNoDataCellType(-2) + val cols = 10 + val rows = cols + val tileSize = cols * rows + val tileCount = 10 + val numND = 4 + lazy val zero = TestData.projectedRasterTile(cols, rows, 0, extent, crs, ct) + lazy val one = TestData.projectedRasterTile(cols, rows, 1, extent, crs, ct) + lazy val two = TestData.projectedRasterTile(cols, rows, 2, extent, crs, ct) + lazy val three = TestData.projectedRasterTile(cols, rows, 3, extent, crs, ct) + lazy val six = ProjectedRasterTile(three * two, three.extent, three.crs) + lazy val nd = TestData.projectedRasterTile(cols, rows, -2, extent, crs, ct) + lazy val randTile = TestData.projectedRasterTile(cols, rows, scala.util.Random.nextInt(), extent, crs, ct) + lazy val randNDTile = TestData.injectND(numND)(randTile) + + lazy val randDoubleTile = TestData.projectedRasterTile(cols, rows, scala.util.Random.nextGaussian(), extent, crs, DoubleConstantNoDataCellType) + lazy val randDoubleNDTile = TestData.injectND(numND)(randDoubleTile) + lazy val randPositiveDoubleTile = TestData.projectedRasterTile(cols, rows, scala.util.Random.nextDouble() + 1e-6, extent, crs, DoubleConstantNoDataCellType) + + val expectedRandNoData: Long = numND * tileCount + val expectedRandData: Long = cols * rows * tileCount - expectedRandNoData + lazy val randNDTilesWithNull = Seq.fill[Tile](tileCount)(injectND(numND)( + TestData.randomTile(cols, rows, UByteConstantNoDataCellType) + )).map(ProjectedRasterTile(_, extent, crs)) :+ null + + implicit val pairEnc = Encoders.tuple(ProjectedRasterTile.prtEncoder, ProjectedRasterTile.prtEncoder) + implicit val tripEnc = 
Encoders.tuple(ProjectedRasterTile.prtEncoder, ProjectedRasterTile.prtEncoder, ProjectedRasterTile.prtEncoder) + + describe("arithmetic tile operations") { + it("should local_add") { + val df = Seq((one, two)).toDF("one", "two") + + val maybeThree = df.select(local_add($"one", $"two")).as[ProjectedRasterTile] + assertEqual(maybeThree.first(), three) + + assertEqual(df.selectExpr("rf_local_add(one, two)").as[ProjectedRasterTile].first(), three) + + val maybeThreeTile = df.select(local_add(ExtractTile($"one"), ExtractTile($"two"))).as[Tile] + assertEqual(maybeThreeTile.first(), three.toArrayTile()) + checkDocs("rf_local_add") + } + + it("should local_subtract") { + val df = Seq((three, two)).toDF("three", "two") + val maybeOne = df.select(local_subtract($"three", $"two")).as[ProjectedRasterTile] + assertEqual(maybeOne.first(), one) + + assertEqual(df.selectExpr("rf_local_subtract(three, two)").as[ProjectedRasterTile].first(), one) + + val maybeOneTile = + df.select(local_subtract(ExtractTile($"three"), ExtractTile($"two"))).as[Tile] + assertEqual(maybeOneTile.first(), one.toArrayTile()) + checkDocs("rf_local_subtract") + } + + it("should local_multiply") { + val df = Seq((three, two)).toDF("three", "two") + + val maybeSix = df.select(local_multiply($"three", $"two")).as[ProjectedRasterTile] + assertEqual(maybeSix.first(), six) + + assertEqual(df.selectExpr("rf_local_multiply(three, two)").as[ProjectedRasterTile].first(), six) + + val maybeSixTile = + df.select(local_multiply(ExtractTile($"three"), ExtractTile($"two"))).as[Tile] + assertEqual(maybeSixTile.first(), six.toArrayTile()) + checkDocs("rf_local_multiply") + } + + it("should local_divide") { + val df = Seq((six, two)).toDF("six", "two") + val maybeThree = df.select(local_divide($"six", $"two")).as[ProjectedRasterTile] + assertEqual(maybeThree.first(), three) + + assertEqual(df.selectExpr("rf_local_divide(six, two)").as[ProjectedRasterTile].first(), three) + + assertEqual(df.selectExpr("rf_local_multiply(rf_local_divide(six, 2.0), two)") + .as[ProjectedRasterTile].first(), six) + + val maybeThreeTile = + df.select(local_divide(ExtractTile($"six"), ExtractTile($"two"))).as[Tile] + assertEqual(maybeThreeTile.first(), three.toArrayTile()) + checkDocs("rf_local_divide") + } + } + + describe("scalar tile operations") { + it("should local_add") { + val df = Seq(one).toDF("one") + val maybeThree = df.select(local_add($"one", 2)).as[ProjectedRasterTile] + assertEqual(maybeThree.first(), three) + + val maybeThreeD = df.select(local_add($"one", 2.1)).as[ProjectedRasterTile] + assertEqual(maybeThreeD.first(), three.convert(DoubleConstantNoDataCellType).localAdd(0.1)) + + val maybeThreeTile = df.select(local_add(ExtractTile($"one"), 2)).as[Tile] + assertEqual(maybeThreeTile.first(), three.toArrayTile()) + } + + it("should local_subtract") { + val df = Seq(three).toDF("three") + + val maybeOne = df.select(local_subtract($"three", 2)).as[ProjectedRasterTile] + assertEqual(maybeOne.first(), one) + + val maybeOneD = df.select(local_subtract($"three", 2.0)).as[ProjectedRasterTile] + assertEqual(maybeOneD.first(), one) + + val maybeOneTile = df.select(local_subtract(ExtractTile($"three"), 2)).as[Tile] + assertEqual(maybeOneTile.first(), one.toArrayTile()) + } + + it("should local_multiply") { + val df = Seq(three).toDF("three") + + val maybeSix = df.select(local_multiply($"three", 2)).as[ProjectedRasterTile] + assertEqual(maybeSix.first(), six) + + val maybeSixD = df.select(local_multiply($"three", 2.0)).as[ProjectedRasterTile] + 
assertEqual(maybeSixD.first(), six) + + val maybeSixTile = df.select(local_multiply(ExtractTile($"three"), 2)).as[Tile] + assertEqual(maybeSixTile.first(), six.toArrayTile()) + } + + it("should local_divide") { + val df = Seq(six).toDF("six") + + val maybeThree = df.select(local_divide($"six", 2)).as[ProjectedRasterTile] + assertEqual(maybeThree.first(), three) + + val maybeThreeD = df.select(local_divide($"six", 2.0)).as[ProjectedRasterTile] + assertEqual(maybeThreeD.first(), three) + + val maybeThreeTile = df.select(local_divide(ExtractTile($"six"), 2)).as[Tile] + assertEqual(maybeThreeTile.first(), three.toArrayTile()) + } + } + + describe("tile comparison relations") { + it("should evaluate local_less") { + val df = Seq((two, three, six)).toDF("two", "three", "six") + df.select(tile_sum(local_less($"two", 6))).first() should be(100.0) + df.select(tile_sum(local_less($"two", 1.9))).first() should be(0.0) + df.select(tile_sum(local_less($"two", 2))).first() should be(0.0) + df.select(tile_sum(local_less($"three", $"two"))).first() should be(0.0) + df.select(tile_sum(local_less($"three", $"three"))).first() should be(0.0) + df.select(tile_sum(local_less($"three", $"six"))).first() should be(100.0) + + df.selectExpr("rf_tile_sum(rf_local_less(two, 6))").as[Double].first() should be(100.0) + df.selectExpr("rf_tile_sum(rf_local_less(three, three))").as[Double].first() should be(0.0) + checkDocs("rf_local_less") + } + + it("should evaluate local_less_equal") { + val df = Seq((two, three, six)).toDF("two", "three", "six") + df.select(tile_sum(local_less_equal($"two", 6))).first() should be(100.0) + df.select(tile_sum(local_less_equal($"two", 1.9))).first() should be(0.0) + df.select(tile_sum(local_less_equal($"two", 2))).first() should be(100.0) + df.select(tile_sum(local_less_equal($"three", $"two"))).first() should be(0.0) + df.select(tile_sum(local_less_equal($"three", $"three"))).first() should be(100.0) + df.select(tile_sum(local_less_equal($"three", $"six"))).first() should be(100.0) + + df.selectExpr("rf_tile_sum(rf_local_less_equal(two, 6))").as[Double].first() should be(100.0) + df.selectExpr("rf_tile_sum(rf_local_less_equal(three, three))").as[Double].first() should be(100.0) + checkDocs("rf_local_less_equal") + } + + it("should evaluate local_greater") { + val df = Seq((two, three, six)).toDF("two", "three", "six") + df.select(tile_sum(local_greater($"two", 6))).first() should be(0.0) + df.select(tile_sum(local_greater($"two", 1.9))).first() should be(100.0) + df.select(tile_sum(local_greater($"two", 2))).first() should be(0.0) + df.select(tile_sum(local_greater($"three", $"two"))).first() should be(100.0) + df.select(tile_sum(local_greater($"three", $"three"))).first() should be(0.0) + df.select(tile_sum(local_greater($"three", $"six"))).first() should be(0.0) + + df.selectExpr("rf_tile_sum(rf_local_greater(two, 1.9))").as[Double].first() should be(100.0) + df.selectExpr("rf_tile_sum(rf_local_greater(three, three))").as[Double].first() should be(0.0) + checkDocs("rf_local_greater") + } + + it("should evaluate local_greater_equal") { + val df = Seq((two, three, six)).toDF("two", "three", "six") + df.select(tile_sum(local_greater_equal($"two", 6))).first() should be(0.0) + df.select(tile_sum(local_greater_equal($"two", 1.9))).first() should be(100.0) + df.select(tile_sum(local_greater_equal($"two", 2))).first() should be(100.0) + df.select(tile_sum(local_greater_equal($"three", $"two"))).first() should be(100.0) + df.select(tile_sum(local_greater_equal($"three", $"three"))).first() 
should be(100.0) + df.select(tile_sum(local_greater_equal($"three", $"six"))).first() should be(0.0) + df.selectExpr("rf_tile_sum(rf_local_greater_equal(two, 1.9))").as[Double].first() should be(100.0) + df.selectExpr("rf_tile_sum(rf_local_greater_equal(three, three))").as[Double].first() should be(100.0) + checkDocs("rf_local_greater_equal") + } + + it("should evaluate local_equal") { + val df = Seq((two, three, three)).toDF("two", "threeA", "threeB") + df.select(tile_sum(local_equal($"two", 2))).first() should be(100.0) + df.select(tile_sum(local_equal($"two", 2.1))).first() should be(0.0) + df.select(tile_sum(local_equal($"two", $"threeA"))).first() should be(0.0) + df.select(tile_sum(local_equal($"threeA", $"threeB"))).first() should be(100.0) + df.selectExpr("rf_tile_sum(rf_local_equal(two, 1.9))").as[Double].first() should be(0.0) + df.selectExpr("rf_tile_sum(rf_local_equal(threeA, threeB))").as[Double].first() should be(100.0) + checkDocs("rf_local_equal") + } + + it("should evaluate local_unequal") { + val df = Seq((two, three, three)).toDF("two", "threeA", "threeB") + df.select(tile_sum(local_unequal($"two", 2))).first() should be(0.0) + df.select(tile_sum(local_unequal($"two", 2.1))).first() should be(100.0) + df.select(tile_sum(local_unequal($"two", $"threeA"))).first() should be(100.0) + df.select(tile_sum(local_unequal($"threeA", $"threeB"))).first() should be(0.0) + df.selectExpr("rf_tile_sum(rf_local_unequal(two, 1.9))").as[Double].first() should be(100.0) + df.selectExpr("rf_tile_sum(rf_local_unequal(threeA, threeB))").as[Double].first() should be(0.0) + checkDocs("rf_local_unequal") + } + } + + describe("per-tile stats") { + it("should compute data cell counts") { + val df = Seq(TestData.injectND(numND)(two)).toDF("two") + df.select(data_cells($"two")).first() shouldBe (cols * rows - numND).toLong + + val df2 = randNDTilesWithNull.toDF("tile") + df2.select(data_cells($"tile") as "cells") + .agg(sum("cells")) + .as[Long] + .first() should be (expectedRandData) + + checkDocs("rf_data_cells") + } + it("should compute no-data cell counts") { + val df = Seq(TestData.injectND(numND)(two)).toDF("two") + df.select(no_data_cells($"two")).first() should be(numND) + + val df2 = randNDTilesWithNull.toDF("tile") + df2.select(no_data_cells($"tile") as "cells") + .agg(sum("cells")) + .as[Long] + .first() should be (expectedRandNoData) + + checkDocs("rf_no_data_cells") + } + it("should detect no-data tiles") { + val df = Seq(nd).toDF("nd") + df.select(is_no_data_tile($"nd")).first() should be(true) + val df2 = Seq(two).toDF("not_nd") + df2.select(is_no_data_tile($"not_nd")).first() should be(false) + checkDocs("rf_is_no_data_tile") + } + it("should find the minimum cell value") { + val min = randNDTile.toArray().filter(c => raster.isData(c)).min.toDouble + val df = Seq(randNDTile).toDF("rand") + df.select(tile_min($"rand")).first() should be(min) + df.selectExpr("rf_tile_min(rand)").as[Double].first() should be(min) + checkDocs("rf_tile_min") + } + + it("should find the maximum cell value") { + val max = randNDTile.toArray().filter(c => raster.isData(c)).max.toDouble + val df = Seq(randNDTile).toDF("rand") + df.select(tile_max($"rand")).first() should be(max) + df.selectExpr("rf_tile_max(rand)").as[Double].first() should be(max) + checkDocs("rf_tile_max") + } + it("should compute the tile mean cell value") { + val values = randNDTile.toArray().filter(c => raster.isData(c)) + val mean = values.sum.toDouble / values.length + val df = Seq(randNDTile).toDF("rand") + 
df.select(tile_mean($"rand")).first() should be(mean) + df.selectExpr("rf_tile_mean(rand)").as[Double].first() should be(mean) + checkDocs("rf_tile_mean") + } + + it("should compute the tile summary statistics") { + val values = randNDTile.toArray().filter(c => raster.isData(c)) + val mean = values.sum.toDouble / values.length + val df = Seq(randNDTile).toDF("rand") + val stats = df.select(tile_stats($"rand")).first() + stats.mean should be (mean +- 0.00001) + + val stats2 = df.selectExpr("rf_tile_stats(rand) as stats") + .select($"stats".as[CellStatistics]) + .first() + stats2 should be (stats) + + df.select(tile_stats($"rand") as "stats") + .select($"stats.mean").as[Double] + .first() should be(mean +- 0.00001) + df.selectExpr("rf_tile_stats(rand) as stats") + .select($"stats.no_data_cells").as[Long] + .first() should be <= (cols * rows - numND).toLong + + val df2 = randNDTilesWithNull.toDF("tile") + df2 + .select(tile_stats($"tile")("data_cells") as "cells") + .agg(sum("cells")) + .as[Long] + .first() should be (expectedRandData) + + checkDocs("rf_tile_stats") + } + + it("should compute the tile histogram") { + val df = Seq(randNDTile).toDF("rand") + val h1 = df.select(tile_histogram($"rand")).first() + + val h2 = df.selectExpr("rf_tile_histogram(rand) as hist") + .select($"hist".as[CellHistogram]) + .first() + + h1 should be (h2) + + checkDocs("rf_tile_histogram") + } + } + + describe("aggregate statistics") { + it("should count data cells") { + val df = randNDTilesWithNull.filter(_ != null).toDF("tile") + df.select(agg_data_cells($"tile")).first() should be (expectedRandData) + df.selectExpr("rf_agg_data_cells(tile)").as[Long].first() should be (expectedRandData) + + checkDocs("rf_agg_data_cells") + } + it("should count no-data cells") { + val df = randNDTilesWithNull.toDF("tile") + df.select(agg_no_data_cells($"tile")).first() should be (expectedRandNoData) + df.selectExpr("rf_agg_no_data_cells(tile)").as[Long].first() should be (expectedRandNoData) + checkDocs("rf_agg_no_data_cells") + } + + it("should compute aggregate statistics") { + val df = randNDTilesWithNull.toDF("tile") + + df + .select(agg_stats($"tile") as "stats") + .select("stats.data_cells", "stats.no_data_cells") + .as[(Long, Long)] + .first() should be ((expectedRandData, expectedRandNoData)) + df.selectExpr("rf_agg_stats(tile) as stats") + .select("stats.data_cells") + .as[Long] + .first() should be (expectedRandData) + + checkDocs("rf_agg_stats") + } + + it("should compute a aggregate histogram") { + val df = randNDTilesWithNull.toDF("tile") + val hist1 = df.select(agg_approx_histogram($"tile")).first() + val hist2 = df.selectExpr("rf_agg_approx_histogram(tile) as hist") + .select($"hist".as[CellHistogram]) + .first() + hist1 should be (hist2) + checkDocs("rf_agg_approx_histogram") + } + + it("should compute local statistics") { + val df = randNDTilesWithNull.toDF("tile") + val stats1 = df.select(agg_local_stats($"tile")) + .first() + val stats2 = df.selectExpr("rf_agg_local_stats(tile) as stats") + .select($"stats".as[LocalCellStatistics]) + .first() + + stats1 should be (stats2) + checkDocs("rf_agg_local_stats") + } + + it("should compute local min") { + val df = Seq(two, three, one, six).toDF("tile") + df.select(agg_local_min($"tile")).first() should be(one.toArrayTile()) + df.selectExpr("rf_agg_local_min(tile)").as[Tile].first() should be(one.toArrayTile()) + checkDocs("rf_agg_local_min") + } + + it("should compute local max") { + val df = Seq(two, three, one, six).toDF("tile") + 
+    it("should compute local max") {
+      val df = Seq(two, three, one, six).toDF("tile")
+      df.select(agg_local_max($"tile")).first() should be(six.toArrayTile())
+      df.selectExpr("rf_agg_local_max(tile)").as[Tile].first() should be(six.toArrayTile())
+      checkDocs("rf_agg_local_max")
+    }
+
+    it("should compute local data cell counts") {
+      val df = Seq(two, randNDTile, nd).toDF("tile")
+      val t1 = df.select(agg_local_data_cells($"tile")).first()
+      val t2 = df.selectExpr("rf_agg_local_data_cells(tile) as cnt").select($"cnt".as[Tile]).first()
+      t1 should be (t2)
+      checkDocs("rf_agg_local_data_cells")
+    }
+
+    it("should compute local no-data cell counts") {
+      val df = Seq(two, randNDTile, nd).toDF("tile")
+      val t1 = df.select(agg_local_no_data_cells($"tile")).first()
+      val t2 = df.selectExpr("rf_agg_local_no_data_cells(tile) as cnt").select($"cnt".as[Tile]).first()
+      t1 should be (t2)
+      val t3 = df.select(local_add(agg_local_data_cells($"tile"), agg_local_no_data_cells($"tile"))).first()
+      t3 should be(three.toArrayTile())
+      checkDocs("rf_agg_local_no_data_cells")
+    }
+  }
+
+  describe("analytical transformations") {
+    it("should compute normalized_difference") {
+      val df = Seq((three, two)).toDF("three", "two")
+
+      df.select(tile_to_array_double(normalized_difference($"three", $"two")))
+        .first()
+        .forall(_ == 0.2) shouldBe true
+
+      df.selectExpr("rf_tile_to_array_double(rf_normalized_difference(three, two))")
+        .as[Array[Double]]
+        .first()
+        .forall(_ == 0.2) shouldBe true
+
+      checkDocs("rf_normalized_difference")
+    }
+
+    it("should mask one tile against another") {
+      val df = Seq[Tile](randTile).toDF("tile")
+
+      val withMask = df.withColumn("mask",
+        convert_cell_type(
+          local_greater($"tile", 50),
+          "uint8")
+      )
+
+      val withMasked = withMask.withColumn("masked",
+        mask($"tile", $"mask"))
+
+      val result = withMasked.agg(agg_no_data_cells($"tile") < agg_no_data_cells($"masked")).as[Boolean]
+
+      result.first() should be(true)
+
+      checkDocs("rf_mask")
+    }
+
+    it("should inverse mask one tile against another") {
+      val df = Seq[Tile](randTile).toDF("tile")
+
+      val baseND = df.select(agg_no_data_cells($"tile")).first()
+
+      val withMask = df.withColumn("mask",
+        convert_cell_type(
+          local_greater($"tile", 50),
+          "uint8"
+        )
+      )
+
+      val withMasked = withMask
+        .withColumn("masked", mask($"tile", $"mask"))
+        .withColumn("inv_masked", inverse_mask($"tile", $"mask"))
+
+      val result = withMasked.agg(agg_no_data_cells($"masked") + agg_no_data_cells($"inv_masked")).as[Long]
+
+      result.first() should be(tileSize + baseND)
+
+      checkDocs("rf_inverse_mask")
+    }
+
+    it("should mask a tile by a specified value in another tile") {
+      val df = Seq[Tile](randTile).toDF("tile")
+      val mask_value = 4
+
+      val withMask = df.withColumn("mask",
+        local_multiply(convert_cell_type(
+          local_greater($"tile", 50),
+          "uint8"),
+          lit(mask_value)
+        )
+      )
+
+      val withMasked = withMask.withColumn("masked",
+        mask_by_value($"tile", $"mask", lit(mask_value)))
+
+      val result = withMasked.agg(agg_no_data_cells($"tile") < agg_no_data_cells($"masked")).as[Boolean]
+
+      result.first() should be(true)
+      checkDocs("rf_mask_by_value")
+    }
+
+    it("should render ascii art") {
+      val df = Seq[Tile](ProjectedRasterTile(TestData.l8Labels)).toDF("tile")
+      val r1 = df.select(render_ascii($"tile"))
+      val r2 = df.selectExpr("rf_render_ascii(tile)").as[String]
+      r1.first() should be(r2.first())
+      checkDocs("rf_render_ascii")
+    }
+
+    it("should render cells as matrix") {
+      val df = Seq(randDoubleNDTile).toDF("tile")
+      val r1 = df.select(render_matrix($"tile"))
+      val r2 = df.selectExpr("rf_render_matrix(tile)").as[String]
+      r1.first() should be(r2.first())
+      checkDocs("rf_render_matrix")
+    }
+
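`rf_render_matrix` is backed by the `TileAsMatrix.renderMatrix` helper added to the `util` package earlier in this diff. A rough usage sketch outside of Spark (assumes the implicit is brought in via `astraea.spark.rasterframes.util._`):

    import astraea.spark.rasterframes.util._
    import geotrellis.raster.ArrayTile

    val t = ArrayTile(Array(1.0, 2.0, 3.0, Double.NaN), 2, 2)
    // One bracketed row per line; NoData cells are printed as NaN.
    println(t.renderMatrix(significantDigits = 3))
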
+    it("should round tile cell values") {
+
+      val three_plus = TestData.projectedRasterTile(cols, rows, 3.12, extent, crs, DoubleConstantNoDataCellType)
+      val three_less = TestData.projectedRasterTile(cols, rows, 2.92, extent, crs, DoubleConstantNoDataCellType)
+      val three_double = TestData.projectedRasterTile(cols, rows, 3.0, extent, crs, DoubleConstantNoDataCellType)
+
+      val df = Seq((three_plus, three_less, three)).toDF("three_plus", "three_less", "three")
+
+      assertEqual(df.select(round($"three")).as[ProjectedRasterTile].first(), three)
+      assertEqual(df.select(round($"three_plus")).as[ProjectedRasterTile].first(), three_double)
+      assertEqual(df.select(round($"three_less")).as[ProjectedRasterTile].first(), three_double)
+
+      assertEqual(df.selectExpr("rf_round(three)").as[ProjectedRasterTile].first(), three)
+      assertEqual(df.selectExpr("rf_round(three_plus)").as[ProjectedRasterTile].first(), three_double)
+      assertEqual(df.selectExpr("rf_round(three_less)").as[ProjectedRasterTile].first(), three_double)
+
+      checkDocs("rf_round")
+    }
+
+    it("should take logarithms of positive cell values") {
+      // log10 1000 == 3
+      val thousand = TestData.projectedRasterTile(cols, rows, 1000, extent, crs, ShortConstantNoDataCellType)
+      val threesDouble = TestData.projectedRasterTile(cols, rows, 3.0, extent, crs, DoubleConstantNoDataCellType)
+      val zerosDouble = TestData.projectedRasterTile(cols, rows, 0.0, extent, crs, DoubleConstantNoDataCellType)
+
+      val df1 = Seq(thousand).toDF("tile")
+      assertEqual(df1.select(log10($"tile")).as[ProjectedRasterTile].first(), threesDouble)
+
+      // ln(tile) == log10(tile) / log10(e); randPositiveDoubleTile keeps all cell values strictly positive
+      val df2 = Seq(randPositiveDoubleTile).toDF("tile")
+      val log10e = math.log10(math.E)
+      assertEqual(df2.select(log($"tile")).as[ProjectedRasterTile].first(),
+        df2.select(log10($"tile")).as[ProjectedRasterTile].first() / log10e)
+
+      lazy val maybeZeros = df2
+        .selectExpr(s"rf_local_subtract(rf_log(tile), rf_local_divide(rf_log10(tile), ${log10e}))")
+        .as[ProjectedRasterTile].first()
+      assertEqual(maybeZeros, zerosDouble)
+
+      // log1p for zeros should be ln(1)
+      val ln1 = math.log1p(0.0)
+      val df3 = Seq(zero).toDF("tile")
+      val maybeLn1 = df3.selectExpr(s"rf_log1p(tile)").as[ProjectedRasterTile].first()
+      assert(maybeLn1.toArrayDouble().forall(_ == ln1))
+
+      checkDocs("rf_log")
+      checkDocs("rf_log2")
+      checkDocs("rf_log10")
+      checkDocs("rf_log1p")
+    }
+
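The subtraction check above leans on the change-of-base identity; verified on plain doubles (illustration only):

    val x = 42.0
    // ln x == log10 x / log10 e, up to floating-point rounding
    assert(math.abs(math.log(x) - math.log10(x) / math.log10(math.E)) < 1e-12)
    // and log1p(0) == ln(1) == 0, as the rf_log1p assertion expects
    assert(math.log1p(0.0) == 0.0)
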
+    it("should take logarithms with non-positive cell values") {
+      val ni_float = TestData.projectedRasterTile(cols, rows, Double.NegativeInfinity, extent, crs, DoubleConstantNoDataCellType)
+      val zero_float = TestData.projectedRasterTile(cols, rows, 0.0, extent, crs, DoubleConstantNoDataCellType)
+
+      // tile zeros ==> -Infinity
+      val df_0 = Seq(zero).toDF("tile")
+      assertEqual(df_0.select(log($"tile")).as[ProjectedRasterTile].first(), ni_float)
+      assertEqual(df_0.select(log10($"tile")).as[ProjectedRasterTile].first(), ni_float)
+      assertEqual(df_0.select(log2($"tile")).as[ProjectedRasterTile].first(), ni_float)
+      // log1p of zeros should be 0.
+      assertEqual(df_0.select(log1p($"tile")).as[ProjectedRasterTile].first(), zero_float)
+
+      // tile negative values ==> NaN
+      assert(df_0.selectExpr("rf_log(rf_local_subtract(tile, 42))").as[ProjectedRasterTile].first().isNoDataTile)
+      assert(df_0.selectExpr("rf_log2(rf_local_subtract(tile, 42))").as[ProjectedRasterTile].first().isNoDataTile)
+      assert(df_0.select(log1p(local_subtract($"tile", 42))).as[ProjectedRasterTile].first().isNoDataTile)
+      assert(df_0.select(log10(local_subtract($"tile", lit(0.01)))).as[ProjectedRasterTile].first().isNoDataTile)
+
+    }
+
+    it("should take exponential") {
+      val df = Seq(six).toDF("tile")
+
+      // exp inverts log
+      assertEqual(
+        df.select(exp(log($"tile"))).as[ProjectedRasterTile].first(),
+        six
+      )
+
+      // base 2
+      assertEqual(
+        df.select(exp2(log2($"tile"))).as[ProjectedRasterTile].first(),
+        six)
+
+      // base 10
+      assertEqual(
+        df.select(exp10(log10($"tile"))).as[ProjectedRasterTile].first(),
+        six)
+
+      // plus/minus 1
+      assertEqual(
+        df.select(expm1(log1p($"tile"))).as[ProjectedRasterTile].first(),
+        six)
+
+      // SQL
+      assertEqual(
+        df.selectExpr("rf_exp(rf_log(tile))").as[ProjectedRasterTile].first(),
+        six)
+
+      // SQL base 10
+      assertEqual(
+        df.selectExpr("rf_exp10(rf_log10(tile))").as[ProjectedRasterTile].first(),
+        six)
+
+      // SQL base 2
+      assertEqual(
+        df.selectExpr("rf_exp2(rf_log2(tile))").as[ProjectedRasterTile].first(),
+        six)
+
+      // SQL expm1
+      assertEqual(
+        df.selectExpr("rf_expm1(rf_log1p(tile))").as[ProjectedRasterTile].first(),
+        six)
+
+      checkDocs("rf_exp")
+      checkDocs("rf_exp10")
+      checkDocs("rf_exp2")
+      checkDocs("rf_expm1")
+
+    }
+  }
+  it("should resample") {
+    def lowRes = {
+      def base = ArrayTile(Array(1,2,3,4), 2, 2)
+      ProjectedRasterTile(base.convert(ct), extent, crs)
+    }
+    def upsampled = {
+      def base = ArrayTile(Array(
+        1,1,2,2,
+        1,1,2,2,
+        3,3,4,4,
+        3,3,4,4
+      ), 4, 4)
+      ProjectedRasterTile(base.convert(ct), extent, crs)
+    }
+    // a 4x4 tile to upsample by shape
+    def fourByFour = TestData.projectedRasterTile(4, 4, 0, extent, crs, ct)
+
+    def df = Seq(lowRes).toDF("tile")
+
+    val maybeUp = df.select(resample($"tile", lit(2))).as[ProjectedRasterTile].first()
+    assertEqual(maybeUp, upsampled)
+
+    def df2 = Seq((lowRes, fourByFour)).toDF("tile1", "tile2")
+    val maybeUpShape = df2.select(resample($"tile1", $"tile2")).as[ProjectedRasterTile].first()
+    assertEqual(maybeUpShape, upsampled)
+
+    // Downsample by double argument < 1
+    def df3 = Seq(upsampled).toDF("tile").withColumn("factor", lit(0.5))
+    assertEqual(df3.selectExpr("rf_resample(tile, 0.5)").as[ProjectedRasterTile].first(), lowRes)
+    assertEqual(df3.selectExpr("rf_resample(tile, factor)").as[ProjectedRasterTile].first(), lowRes)
+
+    checkDocs("rf_resample")
+  }
+}
diff --git a/core/src/test/scala/astraea/spark/rasterframes/ReprojectGeometryTest.scala b/core/src/test/scala/astraea/spark/rasterframes/ReprojectGeometrySpec.scala
similarity index 98%
rename from core/src/test/scala/astraea/spark/rasterframes/ReprojectGeometryTest.scala
rename to core/src/test/scala/astraea/spark/rasterframes/ReprojectGeometrySpec.scala
index 25ef32176..39ea3b1c1 100644
--- a/core/src/test/scala/astraea/spark/rasterframes/ReprojectGeometryTest.scala
+++ b/core/src/test/scala/astraea/spark/rasterframes/ReprojectGeometrySpec.scala
@@ -31,7 +31,7 @@
  *
  * @since 11/29/18
  */
-class ReprojectGeometryTest extends FunSpec
+class ReprojectGeometrySpec extends FunSpec
   with TestEnvironment with Matchers {
   import spark.implicits._
diff --git
a/core/src/test/scala/astraea/spark/rasterframes/TestData.scala b/core/src/test/scala/astraea/spark/rasterframes/TestData.scala index 2d97bc4df..29c06849e 100644 --- a/core/src/test/scala/astraea/spark/rasterframes/TestData.scala +++ b/core/src/test/scala/astraea/spark/rasterframes/TestData.scala @@ -22,8 +22,10 @@ import java.net.URI import java.nio.file.Paths import java.time.ZonedDateTime +import astraea.spark.rasterframes.expressions.tilestats.NoDataCells +import astraea.spark.rasterframes.model.TileContext import astraea.spark.rasterframes.tiles.ProjectedRasterTile -import astraea.spark.rasterframes.{functions ⇒ F} +import astraea.spark.rasterframes.{functions => F} import com.vividsolutions.jts.geom.{Coordinate, GeometryFactory} import geotrellis.proj4.{CRS, LatLng} import geotrellis.raster @@ -179,9 +181,9 @@ object TestData extends TestData { ) ( z ⇒ if (isNoData(z)) rnd.nextGaussian() else z ) - } while (F.noDataCells(result) != 0L) + } while (NoDataCells.op(result) != 0L) - assert(F.noDataCells(result) == 0L, + assert(NoDataCells.op(result) == 0L, s"Should not have any NoData cells for $cellType:\n${result.asciiDraw()}") result } @@ -205,7 +207,7 @@ object TestData extends TestData { def projectedRasterTile[N: Numeric]( cols: Int, rows: Int, - cellValue: N, + cellValue: => N, extent: Extent, crs: CRS = LatLng, cellType: CellType = ByteConstantNoDataCellType): ProjectedRasterTile = { val num = implicitly[Numeric[N]] @@ -237,11 +239,18 @@ object TestData extends TestData { val targeted = rnd.shuffle(indexes).take(num) def filter(c: Int, r: Int) = targeted.contains(r * t.cols + c) - if(t.cellType.isFloatingPoint) { + val injected = if(t.cellType.isFloatingPoint) { t.mapDouble((c, r, v) ⇒ (if(filter(c,r)) raster.doubleNODATA else v): Double) } else { t.map((c, r, v) ⇒ if(filter(c, r)) raster.NODATA else v) } + +// t match { +// case TileContext(ext, crs) => ProjectedRasterTile(injected, ext, crs) +// case _ => injected +// } + + injected } } diff --git a/core/src/test/scala/astraea/spark/rasterframes/TestEnvironment.scala b/core/src/test/scala/astraea/spark/rasterframes/TestEnvironment.scala index 6b5111170..aaf173014 100644 --- a/core/src/test/scala/astraea/spark/rasterframes/TestEnvironment.scala +++ b/core/src/test/scala/astraea/spark/rasterframes/TestEnvironment.scala @@ -19,11 +19,12 @@ package astraea.spark.rasterframes import java.nio.file.{Files, Paths} +import astraea.spark.rasterframes.encoders.StandardEncoders.PrimitiveEncoders.stringEnc import astraea.spark.rasterframes.ref.RasterSource import astraea.spark.rasterframes.ref.RasterSource.ReadCallback import astraea.spark.rasterframes.util.toParquetFriendlyColumnName import com.vividsolutions.jts.geom.Geometry -import geotrellis.spark.testkit.{TestEnvironment ⇒ GeoTrellisTestEnvironment} +import geotrellis.spark.testkit.{TestEnvironment => GeoTrellisTestEnvironment} import geotrellis.util.LazyLogging import org.apache.spark.SparkContext import org.apache.spark.sql._ @@ -83,6 +84,14 @@ trait TestEnvironment extends FunSpec with GeoTrellisTestEnvironment } def matchGeom(g: Geometry, tolerance: Double) = new GeometryMatcher(g, tolerance) + + def checkDocs(name: String): Unit = { + val docs = sql(s"DESCRIBE FUNCTION EXTENDED $name").as[String].collect().mkString("\n") + docs should include(name) + docs shouldNot include("not found") + docs shouldNot include("null") + docs shouldNot include("N/A") + } } object TestEnvironment { diff --git a/core/src/test/scala/astraea/spark/rasterframes/TileAssemblerSpec.scala 
b/core/src/test/scala/astraea/spark/rasterframes/TileAssemblerSpec.scala index 4c8cc48ea..29eff421f 100644 --- a/core/src/test/scala/astraea/spark/rasterframes/TileAssemblerSpec.scala +++ b/core/src/test/scala/astraea/spark/rasterframes/TileAssemblerSpec.scala @@ -114,7 +114,7 @@ class TileAssemblerSpec extends TestEnvironment { val expected = df.select(agg_stats($"tile")).first() val result = assembled.select(agg_stats($"tile")).first() - assert(result.copy(noDataCells = expected.noDataCells) === expected) + assert(result.copy(no_data_cells = expected.no_data_cells) === expected) } } diff --git a/core/src/test/scala/astraea/spark/rasterframes/TileStatsSpec.scala b/core/src/test/scala/astraea/spark/rasterframes/TileStatsSpec.scala index e4c5be7ac..781b8290d 100644 --- a/core/src/test/scala/astraea/spark/rasterframes/TileStatsSpec.scala +++ b/core/src/test/scala/astraea/spark/rasterframes/TileStatsSpec.scala @@ -21,6 +21,7 @@ package astraea.spark.rasterframes import astraea.spark.rasterframes.TestData.randomTile import astraea.spark.rasterframes.TestData.fracTile +import astraea.spark.rasterframes.expressions.aggstats.LocalMeanAggregate import astraea.spark.rasterframes.stats.CellHistogram import geotrellis.raster._ import geotrellis.spark._ @@ -34,13 +35,12 @@ import org.apache.spark.sql.functions._ * @since 9/18/17 */ class TileStatsSpec extends TestEnvironment with TestData { - + import sqlContext.implicits._ import TestData.injectND describe("computing statistics over tiles") { //import org.apache.spark.sql.execution.debug._ it("should report dimensions") { - import sqlContext.implicits._ val df = Seq[(Tile, Tile)]((byteArrayTile, byteArrayTile)).toDF("tile1", "tile2") val dims = df.select(tile_dimensions($"tile1") as "dims").select("dims.*") @@ -48,8 +48,7 @@ class TileStatsSpec extends TestEnvironment with TestData { assert(dims.as[(Int, Int)].first() === (3, 3)) assert(dims.schema.head.name === "cols") - val query = sql( - """|select dims.* from ( + val query = sql("""|select dims.* from ( |select rf_tile_dimensions(tiles) as dims from ( |select rf_make_constant_tile(1, 10, 10, 'int8raw') as tiles)) |""".stripMargin) @@ -57,18 +56,19 @@ class TileStatsSpec extends TestEnvironment with TestData { assert(query.as[(Int, Int)].first() === (10, 10)) df.repartition(4).createOrReplaceTempView("tmp") - assert(sql("select dims.* from (select rf_tile_dimensions(tile2) as dims from tmp)") - .as[(Int, Int)].first() === (3, 3)) + assert( + sql("select dims.* from (select rf_tile_dimensions(tile2) as dims from tmp)") + .as[(Int, Int)] + .first() === (3, 3)) } it("should report cell type") { - import sqlContext.implicits._ val ct = functions.cellTypes().filter(_ != "bool") - forEvery(ct) { c ⇒ + forEvery(ct) { c => val expected = CellType.fromName(c) val tile = randomTile(5, 5, expected) val result = Seq(tile).toDF("tile").select(cell_type($"tile")).first() - result should be (expected) + result should be(expected) } } @@ -78,15 +78,15 @@ class TileStatsSpec extends TestEnvironment with TestData { val tile3 = randomTile(255, 255, IntCellType) it("should compute accurate item counts") { - import sqlContext.implicits._ val ds = Seq[Tile](tile1, tile2, tile3).toDF("tiles") val checkedValues = Seq[Double](0, 4, 7, 13, 26) val result = checkedValues.map(x => ds.select(tile_histogram($"tiles")).first().itemCount(x)) - forEvery(checkedValues) { x => assert((x == 0 && result.head == 4) || result.contains(x - 1)) } + forEvery(checkedValues) { x => + assert((x == 0 && result.head == 4) || result.contains(x 
- 1)) + } } it("Should compute quantiles") { - import sqlContext.implicits._ val ds = Seq[Tile](tile1, tile2, tile3).toDF("tiles") val numBreaks = 5 val breaks = ds.select(tile_histogram($"tiles")).map(_.quantileBreaks(numBreaks)).collect() @@ -101,51 +101,27 @@ class TileStatsSpec extends TestEnvironment with TestData { ds.createOrReplaceTempView("tmp") withClue("max") { - val max = ds.agg(local_agg_max($"tiles")) + val max = ds.agg(agg_local_max($"tiles")) val expected = Max(byteArrayTile, byteConstantTile) write(max) assert(max.as[Tile].first() === expected) - val sqlMax = sql("select rf_local_agg_max(tiles) from tmp") + val sqlMax = sql("select rf_agg_local_max(tiles) from tmp") assert(sqlMax.as[Tile].first() === expected) } withClue("min") { - val min = ds.agg(local_agg_min($"tiles")) + val min = ds.agg(agg_local_min($"tiles")) val expected = Min(byteArrayTile, byteConstantTile) write(min) assert(min.as[Tile].first() === Min(byteArrayTile, byteConstantTile)) - val sqlMin = sql("select rf_local_agg_min(tiles) from tmp") + val sqlMin = sql("select rf_agg_local_min(tiles) from tmp") assert(sqlMin.as[Tile].first() === expected) } } - it("should count data and no-data cells") { - import sqlContext.implicits._ - val ds = (Seq.fill[Tile](10)(injectND(10)(randomTile(10, 10, UByteConstantNoDataCellType))) :+ null).toDF("tile") - val expectedNoData = 10 * 10 - val expectedData = 10 * 10 * 10 - expectedNoData - - //logger.debug(ds.select($"tile").as[Tile].first.cell_type.name) - - assert(ds.select(data_cells($"tile") as "cells").agg(sum("cells")).as[Long].first() === expectedData) - assert(ds.select(no_data_cells($"tile") as "cells").agg(sum("cells")).as[Long].first() === expectedNoData) - - assert(ds.select(agg_data_cells($"tile")).first() === expectedData) - assert(ds.select(agg_no_data_cells($"tile")).first() === expectedNoData) - - val resultTileStats = ds.select(tile_stats($"tile")("dataCells") as "cells") - .agg(sum("cells")).as[Long] - .first() - assert(resultTileStats === expectedData) - - val (aggDC, aggNDC) = ds.select(agg_stats($"tile")).select("dataCells", "noDataCells").as[(Long, Long)].first() - assert(aggDC === expectedData) - assert(aggNDC === expectedNoData) - } - it("should compute tile statistics") { import sqlContext.implicits._ withClue("mean") { @@ -154,10 +130,11 @@ class TileStatsSpec extends TestEnvironment with TestData { val means1 = ds.select(tile_stats($"value")).map(_.mean).collect val means2 = ds.select(tile_mean($"value")).collect // Compute the mean manually, knowing we're not dealing with no-data values. 
- val means = ds.select(tile_to_array[Float]($"value")).map(a ⇒ a.sum.toDouble / a.length).collect + val means = + ds.select(tile_to_array_double($"value")).map(a => a.sum / a.length).collect - forAll(means.zip(means1)) { case (l, r) ⇒ assert(l === r +- 1e-6) } - forAll(means.zip(means2)) { case (l, r) ⇒ assert(l === r +- 1e-6) } + forAll(means.zip(means1)) { case (l, r) => assert(l === r +- 1e-6) } + forAll(means.zip(means2)) { case (l, r) => assert(l === r +- 1e-6) } } withClue("sum") { val rf = l8Sample(1).projectedRaster.toRF @@ -169,17 +146,15 @@ class TileStatsSpec extends TestEnvironment with TestData { } it("should compute per-tile histogram") { - import sqlContext.implicits._ val ds = Seq.fill[Tile](3)(randomTile(5, 5, FloatCellType)).toDF("tiles") ds.createOrReplaceTempView("tmp") val r1 = ds.select(tile_histogram($"tiles")) assert(r1.first.totalCount === 5 * 5) write(r1) - val r2 = sql("select hist.* from (select rf_tile_histogram(tiles) as hist from tmp)").as[CellHistogram] write(r2) - assert(r1.first.mean === r2.first.mean) + assert(r1.first === r2.first) } it("should compute mean and total count") { @@ -198,33 +173,30 @@ class TileStatsSpec extends TestEnvironment with TestData { } it("should compute aggregate histogram") { - import sqlContext.implicits._ val tileSize = 5 val rows = 10 - val ds = Seq.fill[Tile](rows)(randomTile(tileSize, tileSize, FloatConstantNoDataCellType)).toDF("tiles") + val ds = Seq + .fill[Tile](rows)(randomTile(tileSize, tileSize, FloatConstantNoDataCellType)) + .toDF("tiles") ds.createOrReplaceTempView("tmp") - val agg = ds.select(agg_histogram($"tiles")).as[CellHistogram] + val agg = ds.select(agg_approx_histogram($"tiles")) + val histArray = agg.collect() - assert(histArray.length === 1) + histArray.length should be (1) // examine histogram info val hist = histArray.head - //logger.info(hist.asciiHistogram(128)) - //logger.info(hist.asciiStats) assert(hist.totalCount === rows * tileSize * tileSize) assert(hist.bins.map(_.count).sum === rows * tileSize * tileSize) - val stats = agg.map(_.stats).as("stats") - //stats.select("stats.*").show(false) - assert(stats.first().stddev === 1.0 +- 0.3) // <-- playing with statistical fire :) + val hist2 = sql("select hist.* from (select rf_agg_approx_histogram(tiles) as hist from tmp)").as[CellHistogram] - val hist2 = sql("select hist.* from (select rf_agg_histogram(tiles) as hist from tmp)").as[CellHistogram] + hist2.first.totalCount should be (rows * tileSize * tileSize) - assert(hist2.first.totalCount === rows * tileSize * tileSize) + checkDocs("rf_agg_approx_histogram") } it("should compute aggregate mean") { - import sqlContext.implicits._ val ds = (Seq.fill[Tile](10)(randomTile(5, 5, FloatCellType)) :+ null).toDF("tiles") val agg = ds.select(agg_mean($"tiles")) val stats = ds.select(agg_stats($"tiles") as "stats").select($"stats.mean".as[Double]) @@ -232,14 +204,13 @@ class TileStatsSpec extends TestEnvironment with TestData { } it("should compute aggregate statistics") { - import sqlContext.implicits._ val ds = Seq.fill[Tile](10)(randomTile(5, 5, FloatConstantNoDataCellType)).toDF("tiles") val exploded = ds.select(explode_tiles($"tiles")) val (mean, vrnc) = exploded.agg(avg($"tiles"), var_pop($"tiles")).as[(Double, Double)].first val stats = ds.select(agg_stats($"tiles") as "stats") ///.as[(Long, Double, Double, Double, Double)] -stats.printSchema() + //stats.printSchema() noException shouldBe thrownBy { ds.select(agg_stats($"tiles")).collect() } @@ -250,7 +221,7 @@ stats.printSchema() 
 ds.createOrReplaceTempView("tmp")
      val agg2 = sql("select stats.* from (select rf_agg_stats(tiles) as stats from tmp)")
-      assert(agg2.first().getAs[Long]("dataCells") === 250L)
+      assert(agg2.first().getAs[Long]("data_cells") === 250L)
 
      val agg3 = ds.agg(agg_stats($"tiles") as "stats").select($"stats.mean".as[Double])
      assert(mean === agg3.first())
@@ -258,13 +229,14 @@ stats.printSchema()
    it("should compute aggregate local stats") {
      import sqlContext.implicits._
-      val ave = (nums: Array[Double]) ⇒ nums.sum / nums.length
+      val ave = (nums: Array[Double]) => nums.sum / nums.length
-      val ds = (Seq.fill[Tile](30)(randomTile(5, 5, FloatConstantNoDataCellType))
+      val ds = (Seq
+        .fill[Tile](30)(randomTile(5, 5, FloatConstantNoDataCellType))
        .map(injectND(2)) :+ null).toDF("tiles")
      ds.createOrReplaceTempView("tmp")
-      val agg = ds.select(local_agg_stats($"tiles") as "stats")
+      val agg = ds.select(agg_local_stats($"tiles") as "stats")
      val stats = agg.select("stats.*")
      //printStatsRows(stats)
@@ -273,23 +245,23 @@ stats.printSchema()
      assert(min < -2.0)
      val max = agg.select($"stats.max".as[Tile]).map(_.toArrayDouble().max).first
      assert(max > 2.0)
-      val tendancy = agg.select($"stats.mean".as[Tile]).map(t ⇒ ave(t.toArrayDouble())).first
+      val tendancy = agg.select($"stats.mean".as[Tile]).map(t => ave(t.toArrayDouble())).first
      assert(tendancy < 0.2)
-      val varg = agg.select($"stats.mean".as[Tile]).map(t ⇒ ave(t.toArrayDouble())).first
+      val varg = agg.select($"stats.variance".as[Tile]).map(t => ave(t.toArrayDouble())).first
      assert(varg < 1.1)
 
-      val sqlStats = sql("SELECT stats.* from (SELECT rf_local_agg_stats(tiles) as stats from tmp)")
+      val sqlStats = sql("SELECT stats.* from (SELECT rf_agg_local_stats(tiles) as stats from tmp)")
 
      val tiles = stats.collect().flatMap(_.toSeq).map(_.asInstanceOf[Tile])
      val dsTiles = sqlStats.collect().flatMap(_.toSeq).map(_.asInstanceOf[Tile])
 
-      forEvery(tiles.zip(dsTiles)) { case (t1, t2) ⇒
-        assert(t1 === t2)
+      forEvery(tiles.zip(dsTiles)) {
+        case (t1, t2) =>
+          assert(t1 === t2)
      }
    }
 
    it("should compute accurate statistics") {
-      import sqlContext.implicits._
      val completeTile = squareIncrementingTile(4).convert(IntConstantNoDataCellType)
      val incompleteTile = injectND(2)(completeTile)
@@ -297,50 +269,50 @@ stats.printSchema()
      val dsNd = (Seq.fill(20)(completeTile) :+ incompleteTile :+ null).toDF("tiles")
 
      // counted everything properly
-      val countTile = ds.select(local_agg_data_cells($"tiles")).first()
-      forAll(countTile.toArray())(i ⇒ assert(i === 20))
+      val countTile = ds.select(agg_local_data_cells($"tiles")).first()
+      forAll(countTile.toArray())(i => assert(i === 20))
 
-      val countArray = dsNd.select(local_agg_data_cells($"tiles")).first().toArray()
-      val expectedCount = (completeTile.localDefined().toArray zip incompleteTile.localDefined().toArray())
-        .toSeq.map(pr ⇒ pr._1 * 20 + pr._2)
+      val countArray = dsNd.select(agg_local_data_cells($"tiles")).first().toArray()
+      val expectedCount =
+        (completeTile.localDefined().toArray zip incompleteTile.localDefined().toArray()).toSeq.map(
+          pr => pr._1 * 20 + pr._2)
      assert(countArray === expectedCount)
 
-      val countNodataArray = dsNd.select(local_agg_no_data_cells($"tiles")).first().toArray
+      val countNodataArray = dsNd.select(agg_local_no_data_cells($"tiles")).first().toArray
      assert(countNodataArray === incompleteTile.localUndefined().toArray)
 
-      // GeoTrellis docs do not say how NODATA is treated, but NODATA values are ignored
-      val meanTile = dsNd.select(local_agg_mean($"tiles")).first()
-      assert(meanTile.toArray() ===
completeTile.toArray()) + val minTile = dsNd.select(agg_local_min($"tiles")).first() + assert(minTile.toArray() === completeTile.toArray()) - // GeoTrellis docs state that Min(1.0, NODATA) = NODATA - val minTile = dsNd.select(local_agg_min($"tiles")).first() - assert(minTile.toArray() === incompleteTile.toArray()) + val maxTile = dsNd.select(agg_local_max($"tiles")).first() + assert(maxTile.toArray() === completeTile.toArray()) - // GeoTrellis docs state that Max(1.0, NODATA) = NODATA - val maxTile = dsNd.select(local_agg_max($"tiles")).first() - assert(maxTile.toArray() === incompleteTile.toArray()) + val meanTile = dsNd.select(agg_local_mean($"tiles")).first() + assert(meanTile.toArray() === completeTile.toArray()) } } describe("NoData handling") { - import sqlContext.implicits._ val tsize = 5 val count = 20 val nds = 2 - val tiles = (Seq.fill[Tile](count)(randomTile(tsize, tsize, UByteUserDefinedNoDataCellType(255.toByte))) + val tiles = (Seq + .fill[Tile](count)(randomTile(tsize, tsize, UByteUserDefinedNoDataCellType(255.toByte))) .map(injectND(nds)) :+ null).toDF("tiles") it("should count cells by NoData state") { val counts = tiles.select(no_data_cells($"tiles")).collect().dropRight(1) - forEvery(counts)(c ⇒ assert(c === nds)) + forEvery(counts)(c => assert(c === nds)) val counts2 = tiles.select(data_cells($"tiles")).collect().dropRight(1) - forEvery(counts2)(c ⇒ assert(c === tsize * tsize - nds)) + forEvery(counts2)(c => assert(c === tsize * tsize - nds)) } it("should detect all NoData tiles") { val ndCount = tiles.select("*").where(is_no_data_tile($"tiles")).count() ndCount should be(1) - val ndTiles = (Seq.fill[Tile](count)(ArrayTile.empty(UByteConstantNoDataCellType, tsize, tsize)) :+ null).toDF("tiles") + val ndTiles = + (Seq.fill[Tile](count)(ArrayTile.empty(UByteConstantNoDataCellType, tsize, tsize)) :+ null) + .toDF("tiles") val ndCount2 = ndTiles.select("*").where(is_no_data_tile($"tiles")).count() ndCount2 should be(count + 1) } diff --git a/core/src/test/scala/astraea/spark/rasterframes/TileUDTSpec.scala b/core/src/test/scala/astraea/spark/rasterframes/TileUDTSpec.scala index 75cb2879f..b83b94486 100644 --- a/core/src/test/scala/astraea/spark/rasterframes/TileUDTSpec.scala +++ b/core/src/test/scala/astraea/spark/rasterframes/TileUDTSpec.scala @@ -39,6 +39,7 @@ class TileUDTSpec extends TestEnvironment with TestData with Inspectors { spark.version val tileEncoder: ExpressionEncoder[Tile] = ExpressionEncoder() val TileType = new TileUDT() + implicit val ser = TileUDT.tileSerializer describe("TileUDT") { val tileSizes = Seq(2, 64, 128, 222, 511) diff --git a/core/src/test/scala/astraea/spark/rasterframes/encoders/CatalystSerializerSpec.scala b/core/src/test/scala/astraea/spark/rasterframes/encoders/CatalystSerializerSpec.scala index 614510c00..c489b8d7b 100644 --- a/core/src/test/scala/astraea/spark/rasterframes/encoders/CatalystSerializerSpec.scala +++ b/core/src/test/scala/astraea/spark/rasterframes/encoders/CatalystSerializerSpec.scala @@ -20,30 +20,138 @@ */ package astraea.spark.rasterframes.encoders +import java.time.ZonedDateTime + +import astraea.spark.rasterframes.encoders.StandardEncoders._ +import astraea.spark.rasterframes.model.{CellContext, TileContext, TileDataContext, TileDimensions} +import astraea.spark.rasterframes.ref.{RasterRef, RasterSource} import astraea.spark.rasterframes.{TestData, TestEnvironment} import geotrellis.proj4._ +import geotrellis.raster.{CellSize, CellType, TileLayout, UShortUserDefinedNoDataCellType} +import 
geotrellis.spark.tiling.LayoutDefinition
+import geotrellis.spark.{Bounds, KeyBounds, SpaceTimeKey, SpatialKey, TileLayerMetadata}
+import geotrellis.vector.{Extent, ProjectedExtent}
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
+import org.scalatest.Assertion
 
 class CatalystSerializerSpec extends TestEnvironment with TestData {
+  val dc = TileDataContext(UShortUserDefinedNoDataCellType(3), TileDimensions(12, 23))
+  val tc = TileContext(Extent(1, 2, 3, 4), WebMercator)
+  val cc = CellContext(tc, dc, 34, 45)
+  val ext = Extent(1.2, 2.3, 3.4, 4.5)
+  val tl = TileLayout(10, 10, 20, 20)
+  val ct: CellType = UShortUserDefinedNoDataCellType(5.toShort)
+  val ld = LayoutDefinition(ext, tl)
+  val skb = KeyBounds[SpatialKey](SpatialKey(1, 2), SpatialKey(3, 4))
+
+  def assertSerializerMatchesEncoder[T: CatalystSerializer: ExpressionEncoder](value: T): Assertion = {
+    val enc = implicitly[ExpressionEncoder[T]]
+    val ser = CatalystSerializer[T]
+    ser.schema should be (enc.schema)
+  }
+  def assertConsistent[T: CatalystSerializer](value: T): Assertion = {
+    val ser = CatalystSerializer[T]
+    ser.toRow(value) should be(ser.toRow(value))
+  }
+  def assertInvertible[T: CatalystSerializer](value: T): Assertion = {
+    val ser = CatalystSerializer[T]
+    ser.fromRow(ser.toRow(value)) should be(value)
+  }
+
+  def assertContract[T: CatalystSerializer: ExpressionEncoder](value: T): Assertion = {
+    assertConsistent(value)
+    assertInvertible(value)
+    assertSerializerMatchesEncoder(value)
+  }
 
-  import sqlContext.implicits._
   describe("Specialized serialization on specific types") {
-    it("should support encoding") {
-      import sqlContext.implicits._
+//    it("should support encoding") {
+//      implicit val enc: ExpressionEncoder[CRS] = CatalystSerializerEncoder[CRS]()
+//
+//      //println(enc.deserializer.genCode(new CodegenContext))
+//      val values = Seq[CRS](LatLng, Sinusoidal, ConusAlbers, WebMercator)
+//      val df = spark.createDataset(values)(enc)
+//      //df.show(false)
+//      val results = df.collect()
+//      results should contain allElementsOf values
+//    }
 
-      implicit val enc: ExpressionEncoder[CRS] = CatalystSerializerEncoder[CRS]
+    it("should serialize CRS") {
+      val v: CRS = LatLng
+      assertContract(v)
+    }
 
-      val values = Seq[CRS](LatLng, Sinusoidal, ConusAlbers, WebMercator)
-      val df = spark.createDataset(values)(enc)
-      //df.show(false)
-      val results = df.collect()
-      results should contain allElementsOf values
+    it("should serialize TileDataContext") {
+      assertContract(dc)
     }
 
-    it("should serialize CRS") {
-      val ser = CatalystSerializer[CRS]
-      ser.fromRow(ser.toRow(LatLng)) should be(LatLng)
-      ser.fromRow(ser.toRow(Sinusoidal)) should be(Sinusoidal)
+    it("should serialize TileContext") {
+      assertContract(tc)
+    }
+
+    it("should serialize CellContext") {
+      assertContract(cc)
+    }
+
+    it("should serialize ProjectedRasterTile") {
+      // TODO: Decide if ProjectedRasterTile should be encoded 'flat', non-'flat', or depends
+      val value = TestData.projectedRasterTile(20, 30, -1.2, extent)
+      assertConsistent(value)
+      assertInvertible(value)
+    }
+
+    it("should serialize RasterRef") {
+      val src = RasterSource(remoteCOGSingleband1)
+      val value = RasterRef(src, Some(src.extent.buffer(-3.0)))
+      assertConsistent(value)
+      assertInvertible(value)
+    }
+
+    it("should serialize CellType") {
+      assertContract(ct)
+    }
+
+    it("should serialize Extent") {
+      assertContract(ext)
+    }
+
+    it("should serialize ProjectedExtent") {
+      val pe = ProjectedExtent(ext, ConusAlbers)
+      assertContract(pe)
+    }
+
+    it("should serialize SpatialKey") {
+      val v = SpatialKey(2, 3)
+      assertContract(v)
+    }
+
+    it("should serialize SpaceTimeKey") {
+      val v = SpaceTimeKey(2, 3, ZonedDateTime.now())
+      assertContract(v)
+    }
+
+    it("should serialize CellSize") {
+      val v = CellSize(extent, 50, 60)
+      assertContract(v)
+    }
+
+    it("should serialize TileLayout") {
+      assertContract(tl)
+    }
+
+    it("should serialize LayoutDefinition") {
+      assertContract(ld)
+    }
+
+    it("should serialize Bounds[SpatialKey]") {
+      implicit val skbEnc = ExpressionEncoder[KeyBounds[SpatialKey]]()
+      assertContract(skb)
+    }
+
+    it("should serialize TileLayerMetadata[SpatialKey]") {
+      val tlm = TileLayerMetadata(ct, ld, ext, ConusAlbers, skb)
+      assertContract(tlm)
     }
   }
 }
diff --git a/core/src/test/scala/astraea/spark/rasterframes/encoders/EncodingSpec.scala b/core/src/test/scala/astraea/spark/rasterframes/encoders/EncodingSpec.scala
index dd21b8ce1..a0c0bad0e 100644
--- a/core/src/test/scala/astraea/spark/rasterframes/encoders/EncodingSpec.scala
+++ b/core/src/test/scala/astraea/spark/rasterframes/encoders/EncodingSpec.scala
@@ -25,6 +25,7 @@ import java.io.File
 import java.net.URI
 
 import astraea.spark.rasterframes._
+import astraea.spark.rasterframes.tiles.ProjectedRasterTile
 import com.vividsolutions.jts.geom.Envelope
 import geotrellis.proj4._
 import geotrellis.raster.{CellType, Tile, TileFeature}
@@ -32,6 +33,7 @@ import geotrellis.spark.{SpaceTimeKey, SpatialKey, TemporalProjectedExtent, Tile
 import geotrellis.vector.{Extent, ProjectedExtent}
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.functions._
+import org.apache.spark.sql.rf.TileUDT
 
 /**
  * Test rig for encoding GT types into Catalyst types.
@@ -44,10 +46,14 @@ class EncodingSpec extends TestEnvironment with TestData {
 
   describe("Spark encoding on standard types") {
 
-    it("should code RDD[(Int, Tile)]") {
-      val ds = Seq((1, byteArrayTile: Tile), (2, null)).toDS
-      write(ds)
-      assert(ds.toDF.as[(Int, Tile)].collect().head === ((1, byteArrayTile)))
+    it("should serialize Tile") {
+      val TileType = new TileUDT()
+
+      forAll(allTileTypes) { t =>
+        noException shouldBe thrownBy {
+          TileType.deserialize(TileType.serialize(t))
+        }
+      }
     }
 
     it("should code RDD[Tile]") {
@@ -57,6 +63,12 @@ class EncodingSpec extends TestEnvironment with TestData {
      assert(ds.toDF.as[Tile].collect().head === byteArrayTile)
    }
 
+    it("should code RDD[(Int, Tile)]") {
+      val ds = Seq((1, byteArrayTile: Tile), (2, null)).toDS
+      write(ds)
+      assert(ds.toDF.as[(Int, Tile)].collect().head === ((1, byteArrayTile)))
+    }
+
    it("should code RDD[TileFeature]") {
      val thing = TileFeature(byteArrayTile: Tile, "meta")
      val ds = Seq(thing).toDS()
@@ -64,6 +76,13 @@ class EncodingSpec extends TestEnvironment with TestData {
      assert(ds.toDF.as[TileFeature[Tile, String]].collect().head === thing)
    }
 
+    it("should code RDD[ProjectedRasterTile]") {
+      val tile = TestData.projectedRasterTile(20, 30, -1.2, extent)
+      val ds = Seq(tile).toDS()
+      write(ds)
+      assert(ds.toDF.as[ProjectedRasterTile].collect().head === tile)
+    }
+
    it("should code RDD[Extent]") {
      val ds = Seq(extent).toDS()
      write(ds)
@@ -85,8 +104,6 @@ class EncodingSpec extends TestEnvironment with TestData {
    it("should code RDD[CellType]") {
      val ct = CellType.fromName("uint8")
      val ds = Seq(ct).toDS()
-      //ds.printSchema()
-      //ds.show(false)
      write(ds)
      assert(ds.toDF.as[CellType].first() === ct)
    }
@@ -105,7 +122,7 @@ class EncodingSpec extends TestEnvironment with TestData {
      assert(ds.toDF.as[(SpatialKey, SpaceTimeKey)].first === (sk, stk))
 
      // This stinks: vvvvvvvv Encoders don't seem to work with UDFs.
- val key2col = udf((row: Row) ⇒ row.getInt(0)) + val key2col = udf((row: Row) => row.getInt(0)) val colNum = ds.select(key2col(ds(ds.columns.head))).as[Int].first() assert(colNum === 37) @@ -118,13 +135,13 @@ class EncodingSpec extends TestEnvironment with TestData { val results = ds.toDF.as[CRS].collect() - results should contain allElementsOf (values) + results should contain allElementsOf values } it("should code RDD[URI]") { val ds = Seq[URI](new URI("http://astraea.earth/"), new File("/tmp/humbug").toURI).toDS() write(ds) - assert(ds.filter(u ⇒ Option(u.getHost).exists(_.contains("astraea"))).count === 1) + assert(ds.filter(u => Option(u.getHost).exists(_.contains("astraea"))).count === 1) } it("should code RDD[Envelope]") { @@ -135,6 +152,4 @@ class EncodingSpec extends TestEnvironment with TestData { } } - } - diff --git a/core/src/test/scala/astraea/spark/rasterframes/ref/RasterRefSpec.scala b/core/src/test/scala/astraea/spark/rasterframes/ref/RasterRefSpec.scala index c38f59cc8..4efe2b474 100644 --- a/core/src/test/scala/astraea/spark/rasterframes/ref/RasterRefSpec.scala +++ b/core/src/test/scala/astraea/spark/rasterframes/ref/RasterRefSpec.scala @@ -23,7 +23,8 @@ package astraea.spark.rasterframes.ref import astraea.spark.rasterframes.TestEnvironment.ReadMonitor import astraea.spark.rasterframes._ -import astraea.spark.rasterframes.expressions._ +import astraea.spark.rasterframes.expressions.transformers._ +import astraea.spark.rasterframes.expressions.accessors._ import astraea.spark.rasterframes.ref.RasterRef.RasterRefTile import geotrellis.raster.Tile import geotrellis.vector.Extent @@ -184,7 +185,7 @@ class RasterRefSpec extends TestEnvironment with TestData { new Fixture { import spark.implicits._ val df = Seq(src).toDF("src") - val refs = df.select(RasterSourceToRasterRefs($"src")) + val refs = df.select(RasterSourceToRasterRefs(true, $"src")) assert(refs.count() > 1) } } diff --git a/core/src/test/scala/astraea/spark/rasterframes/ref/RasterSourceSpec.scala b/core/src/test/scala/astraea/spark/rasterframes/ref/RasterSourceSpec.scala index f1bfc2dec..1c1fb182a 100644 --- a/core/src/test/scala/astraea/spark/rasterframes/ref/RasterSourceSpec.scala +++ b/core/src/test/scala/astraea/spark/rasterframes/ref/RasterSourceSpec.scala @@ -163,27 +163,4 @@ class RasterSourceSpec extends TestEnvironment with TestData { } } } - - describe("RasterSource.readAll") { - it("should return consistently ordered tiles across bands for a given scene") { - - // These specific scenes exhibit the problem where - // we see different subtile segment ordering across - // the bands of a given scene. 
- val rURI = new URI("https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/016/034/LC08_L1TP_016034_20181003_20181003_01_RT/LC08_L1TP_016034_20181003_20181003_01_RT_B4.TIF") - val bURI = new URI("https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/016/034/LC08_L1TP_016034_20181003_20181003_01_RT/LC08_L1TP_016034_20181003_20181003_01_RT_B2.TIF") - //val gURI = new URI("https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/016/034/LC08_L1TP_016034_20181003_20181003_01_RT/LC08_L1TP_016034_20181003_20181003_01_RT_B3.TIF") - - val red = RasterSource(rURI).readAll().left.get - val blue = RasterSource(bURI).readAll().left.get - //val green = RasterSource(gURI).readAll().left.get - - red should not be empty - red.size should equal(blue.size) - //red.size should equal(green.size) - - red.map(_.dimensions) should contain theSameElementsAs blue.map(_.dimensions) - //red.map(_.dimensions) should contain theSameElementsInOrderAs green.map(_.dimensions) - } - } } diff --git a/core/src/test/scala/examples/Exporting.scala b/core/src/test/scala/examples/Exporting.scala index 5dca99397..247e93944 100644 --- a/core/src/test/scala/examples/Exporting.scala +++ b/core/src/test/scala/examples/Exporting.scala @@ -54,7 +54,7 @@ object Exporting extends App { // The @scaladoc[`tile_to_array`][tile_to_array] column function requires a type parameter to indicate the array element // type you would like used. The following types may be used: `Int`, `Double`, `Byte`, `Short`, `Float` - val withArrays = rf.withColumn("tileData", tile_to_array[Short]($"tile")).drop("tile") + val withArrays = rf.withColumn("tileData", tile_to_array_int($"tile")).drop("tile") withArrays.show(5, 40) // You can convert the data back to an array, but you have to specify the target tile dimensions. diff --git a/deployment/README.md b/deployment/README.md index d9ce541da..5e008b8a1 100644 --- a/deployment/README.md +++ b/deployment/README.md @@ -56,7 +56,7 @@ To build the Docker image based on local development changes: ```bash # from the root of the repo -sbt deployment/rfNotebookContainer +sbt deployment/rfDocker ``` ## Base images diff --git a/deployment/build.sbt b/deployment/build.sbt index 66cc330ae..c76ef554b 100644 --- a/deployment/build.sbt +++ b/deployment/build.sbt @@ -11,13 +11,13 @@ val Python = config("python") lazy val rfDockerImageName = settingKey[String]("Name to tag Docker image with.") rfDockerImageName := "s22s/rasterframes-notebooks" -lazy val rfNotebookContainer = taskKey[Unit]("Build Jupyter Notebook Docker image with RasterFrames support.") -rfNotebookContainer := (Docker / packageBin).value +lazy val rfDocker = taskKey[Unit]("Build Jupyter Notebook Docker image with RasterFrames support.") +rfDocker := (Docker / packageBin).value lazy val runRFNotebook = taskKey[String]("Run RasterFrames Jupyter Notebook image") runRFNotebook := { val imageName = rfDockerImageName.value - val _ = rfNotebookContainer.value + val _ = rfDocker.value Process(s"docker run -p 8888:8888 -p 4040:4040 $imageName").run() imageName } diff --git a/docs/src/main/tut/apps/geotrellis-ops.md b/docs/src/main/tut/apps/geotrellis-ops.md index 4ea3efb08..81a97a3fa 100644 --- a/docs/src/main/tut/apps/geotrellis-ops.md +++ b/docs/src/main/tut/apps/geotrellis-ops.md @@ -34,7 +34,7 @@ Here's an example downsampling a tile and rendering each tile as a matrix of num ```tut val downsample = udf((t: Tile) => t.resample(4, 4)) val downsampled = rf.where(no_data_cells($"tile") === 0).select(downsample($"tile") as "minime") 
-downsampled.select(tile_to_array[Float]($"minime") as "cell_values").limit(2).show(false)
+downsampled.select(tile_to_array_double($"minime") as "cell_values").limit(2).show(false)
 ```
diff --git a/docs/src/main/tut/exporting-rasterframes.md b/docs/src/main/tut/exporting-rasterframes.md
index def137fc8..2015943f5 100644
--- a/docs/src/main/tut/exporting-rasterframes.md
+++ b/docs/src/main/tut/exporting-rasterframes.md
@@ -27,11 +27,11 @@ The cell values within a `Tile` are encoded internally as an array. There may be
 where the additional context provided by the `Tile` construct is no longer needed and one would prefer
 to work with the underlying array data.
 
-The @scaladoc[`tileToArray`][tileToArray] column function requires a type parameter to indicate the array element
+The @scaladoc[`tile_to_array_int`][tile_to_array_int] column function requires a type parameter to indicate the array element
 type you would like used. The following types may be used: `Int`, `Double`, `Byte`, `Short`, `Float`
 
 ```tut
-val withArrays = rf.withColumn("tileData", tileToArray[Short]($"tile")).drop("tile")
+val withArrays = rf.withColumn("tileData", tile_to_array_int($"tile")).drop("tile")
 withArrays.show(5, 40)
 ```
 
@@ -186,5 +186,5 @@ spark.stop()
 [rfInit]: astraea.spark.rasterframes.package#rfInit%28SQLContext%29:Unit
 [rdd]: org.apache.spark.sql.Dataset#frdd:org.apache.spark.rdd.RDD[T]
 [toTileLayerRDD]: astraea.spark.rasterframes.RasterFrameMethods#toTileLayerRDD%28tileCol:RasterFrameMethods.this.TileColumn%29:Either[geotrellis.spark.TileLayerRDD[geotrellis.spark.SpatialKey],geotrellis.spark.TileLayerRDD[geotrellis.spark.SpaceTimeKey]]
-[tileToArray]: astraea.spark.rasterframes.ColumnFunctions#tileToArray
+[tile_to_array_int]: astraea.spark.rasterframes.ColumnFunctions#tile_to_array_int
 
diff --git a/docs/src/main/tut/ml/statistics.md b/docs/src/main/tut/ml/statistics.md
index 6b0328146..3ff086ad1 100644
--- a/docs/src/main/tut/ml/statistics.md
+++ b/docs/src/main/tut/ml/statistics.md
@@ -81,7 +81,7 @@ rf.select(agg_stats($"tile")).show()
 A more involved example: extract bin counts from a computed `Histogram`.
 
 ```tut
-rf.select(agg_histogram($"tile")).
+rf.select(agg_approx_histogram($"tile")).
  map(h => for(v <- h.labels) yield(v, h.itemCount(v))).
  select(explode($"value") as "counts").
  select("counts._1", "counts._2").
diff --git a/docs/src/main/tut/pyrasterframes.md b/docs/src/main/tut/pyrasterframes.md
index 027746c10..35a5169d5 100644
--- a/docs/src/main/tut/pyrasterframes.md
+++ b/docs/src/main/tut/pyrasterframes.md
@@ -7,9 +7,8 @@ in the meantime:
 * [PyRasterFrames README](https://github.com/locationtech/rasterframes/blob/develop/pyrasterframes/python/README.rst)
 * [PyRasterFrames Examples](https://github.com/locationtech/rasterframes/tree/develop/pyrasterframes/python/examples)
 * [RasterFrames Jupyter Notebook](https://github.com/locationtech/rasterframes/blob/develop/deployment/README.md)
+* @ref:[PyRasterFrames Functions](reference.md)
 
-Most features available in the Scala API are exposed in the Python API, and take almost the same form as they
-do in Scala. Python UDFs on `Tile` are not yet supported.
+Most features available in the Scala API are exposed in the Python API; refer to the @ref:[function reference](reference.md). Defining a [udf](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.udf) using a `Tile` column through the Python API is not yet supported.
-If there's a specific feature that appears to be missing in the Python version [please submit an issue](https://github.com/locationtech/rasterframes/issues)
-so that we might address it for you.
\ No newline at end of file
+If there's a specific feature that appears to be missing in the Python version [please submit an issue](https://github.com/locationtech/rasterframes/issues) so that we might address it for you.
diff --git a/docs/src/main/tut/reference.md b/docs/src/main/tut/reference.md
index 704fbe367..d264d900d 100644
--- a/docs/src/main/tut/reference.md
+++ b/docs/src/main/tut/reference.md
@@ -1,8 +1,946 @@
-# Reference
+# Function Reference
+
+For the most up-to-date list of User Defined Functions on Tiles, see the API documentation for @scaladoc[`RasterFunctions`][RasterFunctions].
+
+The full Scala API documentation can be found [here][scaladoc].
+
+RasterFrames also provides SQL and Python bindings to many UDFs using the `Tile` column type. In Spark SQL, the functions are already registered in the SQL engine; they are usually prefixed with `rf_`. In Python, they are available in the `pyrasterframes.rasterfunctions` module.
+
+The convention in this document is to define each function signature as below, with its return type, the function name, and named arguments with their types.
+
+```
+ReturnDataType function_name(InputDataType argument1, InputDataType argument2)
+```
+
+## List of Available SQL and Python Functions
+
+@@toc { depth=3 }
+
+### Vector Operations
+
+Various LocationTech GeoMesa UDFs to deal with `geometry` type columns are also provided in the SQL engine and within the `pyrasterframes.rasterfunctions` Python module. These are documented in the [LocationTech GeoMesa Spark SQL documentation](https://www.geomesa.org/documentation/user/spark/sparksql_functions.html#). These functions are all prefixed with `st_`.
+
+RasterFrames provides two additional functions for vector geometry.
+
+#### reproject_geometry
+
+_Python_:
+
+    Geometry reproject_geometry(Geometry geom, String origin_crs, String destination_crs)
+
+_SQL_: `rf_reproject_geometry`
+
+Reproject the vector `geom` from `origin_crs` to `destination_crs`. Both `_crs` arguments are either [proj4](https://proj4.org/usage/quickstart.html) strings, [EPSG codes](https://www.epsg-registry.org/), or [OGC WKT](https://www.opengeospatial.org/standards/wkt-crs) for coordinate reference systems.
+
+#### envelope
+
+_Python_:
+
+    Struct[Double minX, Double maxX, Double minY, Double maxY] envelope(Geometry geom)
+
+Python only. Extracts the bounding box (envelope) of the geometry.
+
+See also GeoMesa [st_envelope](https://www.geomesa.org/documentation/user/spark/sparksql_functions.html#st-envelope), which returns a Geometry type.
+
+### Tile Metadata and Mutation
+
+Functions to access and change the particulars of a `tile`: its shape and the data type of its cells. See the section on @ref:[masking and nodata](reference.md#masking-and-nodata) below for additional discussion of cell types.
+
+#### cell_types
+
+_Python_:
+
+    Array[String] cell_types()
+
+_SQL_: `rf_cell_types`
+
+Return an array of possible cell type names, as below. These names are used in other functions. See @ref:[discussion on nodata](reference.md#masking-and-nodata) for additional details.
+
+|cell_types|
+|----------|
+|bool|
+|int8raw|
+|int8|
+|uint8raw|
+|uint8|
+|int16raw|
+|int16|
+|uint16raw|
+|uint16|
+|int32raw|
+|int32|
+|float32raw|
+|float32|
+|float64raw|
+|float64|
+
+#### tile_dimensions
+
+_Python_:
+
+    Struct[Int, Int] tile_dimensions(Tile tile)
+
+_SQL_: `rf_tile_dimensions`
+
+Get the number of columns and rows in the `tile`, as a struct of `cols` and `rows`.
+
+#### cell_type
+
+_Python_:
+
+    Struct[String] cell_type(Tile tile)
+
+_SQL_: `rf_cell_type`
+
+Get the cell type of the `tile`. Available cell types can be retrieved with the @ref:[cell_types](reference.md#cell-types) function.
+
+#### convert_cell_type
+
+_Python_:
+
+    Tile convert_cell_type(Tile tileCol, String cellType)
+
+_SQL_: `rf_convert_cell_type`
+
+Convert `tileCol` to a different cell type.
+
+#### resample
+
+_Python_:
+
+    Tile resample(Tile tile, Double factor)
+    Tile resample(Tile tile, Int factor)
+    Tile resample(Tile tile, Tile shape_tile)
+
+_SQL_: `rf_resample`
+
+Change the tile dimensions. Passing a numeric `factor` will scale the number of columns and rows in the tile: 1.0 keeps the same number of columns and rows; less than one downsamples the tile; greater than one upsamples the tile. Passing a `shape_tile` as the second argument outputs a `tile` with the same number of columns and rows as `shape_tile`. All resampling uses the nearest-neighbor method.
+
+### Tile Creation
+
+Functions to create a new Tile column, either from scratch or from existing data not yet in a `tile`.
+
+#### tile_zeros
+
+_Python_:
+
+```
+Tile tile_zeros(Int tile_columns, Int tile_rows, String cell_type_name)
+```
+
+_SQL_: `rf_tile_zeros`
+
+Create a `tile` of shape `tile_columns` by `tile_rows` full of zeros, with the specified cell type. See function @ref:[`cell_types`](reference.md#cell-types) for valid values. All arguments are literal values and not column expressions.
+
+#### tile_ones
+
+_Python_:
+
+```
+Tile tile_ones(Int tile_columns, Int tile_rows, String cell_type_name)
+```
+
+_SQL_: `rf_tile_ones`
+
+Create a `tile` of shape `tile_columns` by `tile_rows` full of ones, with the specified cell type. See function @ref:[`cell_types`](reference.md#cell-types) for valid values. All arguments are literal values and not column expressions.
+
+#### make_constant_tile
+
+_Python_:
+
+    Tile make_constant_tile(Numeric constant, Int tile_columns, Int tile_rows, String cell_type_name)
+
+_SQL_: `rf_make_constant_tile`
+
+Create a `tile` of shape `tile_columns` by `tile_rows` full of `constant`, with the specified cell type. See function @ref:[`cell_types`](reference.md#cell-types) for valid values. All arguments are literal values and not column expressions.
+
+#### rasterize
+
+_Python_:
+
+    Tile rasterize(Geometry geom, Geometry tile_bounds, Int value, Int tile_columns, Int tile_rows)
+
+_SQL_: `rf_rasterize`
+
+Convert a vector Geometry `geom` into a Tile representation. The `value` will be "burned-in" to the returned `tile` where the `geom` intersects the `tile_bounds`. The returned `tile` will have shape `tile_columns` by `tile_rows`. Values outside the `geom` will be assigned a nodata value. The returned `tile` has cell type `int32`; note that `value` is of type Int.
+
+Parameters `tile_columns` and `tile_rows` are literals, not column expressions. The others are column expressions.
+
+Example use: in the code snippet below, you can visualize the `tri` and `b` geometries with tools like [Wicket](https://arthur-e.github.io/Wicket/sandbox-gmaps3.html). The result is a right triangle burned into the `tile`, with nodata values shown as ∘.
+
+```python
+spark.sql("""
+SELECT rf_render_ascii(
+        rf_rasterize(tri, b, 8, 10, 10))
+
+FROM
+  ( SELECT st_geomFromWKT('POLYGON((1.5 0.5, 1.5 1.5, 0.5 0.5, 1.5 0.5))') AS tri,
+           st_geomFromWKT('POLYGON((0.0 0.0, 2.0 0.0, 2.0 2.0, 0.0 2.0, 0.0 0.0))') AS b
+  ) r
+""").show(1, False)
+
+-----------
+|∘∘∘∘∘∘∘∘∘∘
+∘∘∘∘∘∘∘∘∘∘
+∘∘∘∘∘∘∘∘∘∘
+∘∘∘∘∘∘∘ ∘∘
+∘∘∘∘∘∘  ∘∘
+∘∘∘∘∘   ∘∘
+∘∘∘∘    ∘∘
+∘∘∘     ∘∘
+∘∘∘∘∘∘∘∘∘∘
+∘∘∘∘∘∘∘∘∘∘|
+-----------
+```
+
+#### array_to_tile
+
+_Python_:
+
+    Tile array_to_tile(Array arrayCol, Int numCols, Int numRows)
+
+Python only. Create a `tile` from a Spark SQL [Array](http://spark.apache.org/docs/2.3.2/api/python/pyspark.sql.html#pyspark.sql.types.ArrayType), filling values in row-major order.
+
+#### assemble_tile
+
+_Python_:
+
+    Tile assemble_tile(Int colIndex, Int rowIndex, Numeric cellData, Int numCols, Int numRows, String cellType)
+
+Python only. Create a Tile from a column of cell data with location indices. This function is the inverse of @ref:[`explode_tiles`](reference.md#explode-tiles). Intended use is with a `groupby`, producing one row with a new `tile` per group. The `numCols`, `numRows` and `cellType` arguments are literal values; the others are column expressions. Valid values for `cellType` can be found with the function @ref:[`cell_types`](reference.md#cell-types).
+
+### Masking and Nodata
+
+In raster operations, the preservation and correct processing of missing observations is very important. The idea of missing data is often expressed as a null or NaN. In raster data, missing observations are often termed NODATA; we will style them as nodata in this document. RasterFrames provides a variety of functions to manage and inspect nodata within `tile`s.
+
+See also the statistical summary functions, which report the count of data and nodata values per `tile` and aggregated over a `tile` column: @ref:[`data_cells`](reference.md#data-cells), @ref:[`no_data_cells`](reference.md#no-data-cells), @ref:[`agg_data_cells`](reference.md#agg-data-cells), @ref:[`agg_no_data_cells`](reference.md#agg-no-data-cells).
+
+Note that not all cell types support a nodata representation: `bool` and the cell types whose names end in `raw` do not.
+
+For integral valued cell types, the nodata is marked by a special sentinel value. This can be a default, typically zero or the minimum value for the underlying data type. The nodata value can also be a user-defined value. For example, if the value 4 is to be interpreted as nodata, the cell type will read 'int32ud4'.
+
+For float cell types, the nodata can either be NaN or a user-defined value; for example `'float32ud-999.9'` would mean the value -999.9 is interpreted as nodata.
+
+For more reading about cell types and nodata, see the [GeoTrellis documentation](https://geotrellis.readthedocs.io/en/latest/guide/core-concepts.html?#working-with-cell-values).
+
+#### mask
+
+_Python_:
+
+    Tile mask(Tile tile, Tile mask)
+
+_SQL_: `rf_mask`
+
+Where the `mask` contains nodata, replace values in the `tile` with nodata.
+
+The returned `tile` cell type will be coerced to one supporting nodata if it does not already.
+
+#### inverse_mask
+
+_Python_:
+
+    Tile inverse_mask(Tile tile, Tile mask)
+
+_SQL_: `rf_inverse_mask`
+
+Where the `mask` _does not_ contain nodata, replace values in `tile` with nodata.
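+
+As a quick, hedged sketch (assuming a DataFrame `rf` with hypothetical `tile` and `mask` columns), `mask` and `inverse_mask` split the cells of `tile` into complementary sets:
+
+```python
+from pyrasterframes.rasterfunctions import mask, inverse_mask
+
+# Cells where `rf.mask` is nodata become nodata in `masked`;
+# `inverse_masked` keeps exactly the complementary cells.
+masked_pair = rf.select(
+    mask(rf.tile, rf.mask).alias('masked'),
+    inverse_mask(rf.tile, rf.mask).alias('inverse_masked'))
+```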
+
+#### mask_by_value
+
+_Python_:
+
+    Tile mask_by_value(Tile data_tile, Tile mask_tile, Int mask_value)
+
+_SQL_: `rf_mask_by_value`
+
+Generate a `tile` with the values from `data_tile`, with nodata in cells where the `mask_tile` is equal to `mask_value`.
+
+#### is_no_data_tile
+
+_Python_:
+
+    Boolean is_no_data_tile(tile)
+
+_SQL_: `rf_is_no_data_tile`
+
+Returns true if `tile` contains only nodata. By definition, returns false if the cell type does not support nodata.
+
+#### with_no_data
+
+_Python_:
+
+    Tile with_no_data(Tile tile, Double no_data_value)
+
+Python only. Return a `tile` column marking as nodata all cells equal to `no_data_value`.
+
+The `no_data_value` argument is a literal Double, not a Column expression.
+
+If the input `tile` already had a nodata value, the behaviour depends on whether its cell type is floating point. For floating point cell type `tile`s, nodata values on the input `tile` remain nodata values on the output. For integral cell type `tile`s, the previous nodata values become literal values.
+
+### Map Algebra
+
+[Map algebra](https://gisgeography.com/map-algebra-global-zonal-focal-local/) raster operations are element-wise operations between a `tile` and a scalar, between two `tile`s, or among many `tile`s.
+
+Some of these functions have similar variations in the Python API:
+
+ - `local_op`: applies `op` to two columns; the right hand side can be a `tile` or a numeric column.
+ - `local_op_scalar`: applies `op` to a `tile` and a literal scalar, coercing the `tile` to a floating point type
+ - `local_op_scalar_int`: applies `op` to a `tile` and a literal scalar, without coercing the `tile` to a floating point type
+
+We document all these variations for `local_add` and then suppress them for the remaining operations; the sketch after the `local_multiply` entry shows how the variants compose.
+
+The SQL API does not require the `local_op_scalar` or `local_op_scalar_int` forms.
+
+#### local_add
+
+_Python_:
+
+    Tile local_add(Tile tile1, Tile rhs)
+    Tile local_add(Tile tile1, Int rhs)
+    Tile local_add(Tile tile1, Double rhs)
+
+_SQL_: `rf_local_add`
+
+Returns a `tile` column containing the element-wise sum of `tile1` and `rhs`.
+
+#### local_add_scalar
+
+_Python_:
+
+    Tile local_add_scalar(Tile tile, Double scalar)
+
+_SQL_: `rf_local_add_scalar`
+
+Returns a `tile` column containing the element-wise sum of `tile` and `scalar`. If `tile` is an integral type, it will be coerced to floating point before addition; returns a float-valued `tile`.
+
+#### local_add_scalar_int
+
+_Python_:
+
+    Tile local_add_scalar_int(Tile tile, Int scalar)
+
+_SQL_: `rf_local_add_scalar_int`
+
+Returns a `tile` column containing the element-wise sum of `tile` and `scalar`. If `tile` is an integral type, returns an integral type `tile`.
+
+#### local_subtract
+
+_Python_:
+
+    Tile local_subtract(Tile tile1, Tile rhs)
+    Tile local_subtract(Tile tile1, Int rhs)
+    Tile local_subtract(Tile tile1, Double rhs)
+
+_SQL_: `rf_local_subtract`
+
+Returns a `tile` column containing the element-wise difference of `tile1` and `rhs`.
+
+#### local_multiply
+
+_Python_:
+
+    Tile local_multiply(Tile tile1, Tile rhs)
+    Tile local_multiply(Tile tile1, Int rhs)
+    Tile local_multiply(Tile tile1, Double rhs)
+
+_SQL_: `rf_local_multiply`
+
+Returns a `tile` column containing the element-wise product of `tile1` and `rhs`. This is **not** the matrix multiplication of `tile1` and `rhs`.
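+
+To make the variants concrete, here is a minimal, hedged sketch (the DataFrame `rf` and its `tile1` and `tile2` columns are hypothetical) chaining a tile-tile product with a tile-scalar sum; per the signatures above, the numeric right-hand side is passed directly:
+
+```python
+from pyrasterframes.rasterfunctions import local_add, local_multiply
+
+# Element-wise (tile1 * tile2) + 1.0, evaluated cell by cell
+result = rf.select(
+    local_add(local_multiply(rf.tile1, rf.tile2), 1.0).alias('result'))
+```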
+
+#### local_divide
+
+_Python_:
+
+    Tile local_divide(Tile tile1, Tile rhs)
+    Tile local_divide(Tile tile1, Int rhs)
+    Tile local_divide(Tile tile1, Double rhs)
+
+_SQL_: `rf_local_divide`
+
+Returns a `tile` column containing the element-wise quotient of `tile1` and `rhs`.
+
+#### normalized_difference
+
+_Python_:
+
+    Tile normalized_difference(Tile tile1, Tile tile2)
+
+_SQL_: `rf_normalized_difference`
+
+Compute the normalized difference of the two `tile`s: `(tile1 - tile2) / (tile1 + tile2)`. The result is always a floating point cell type. This function has no scalar variant.
+
+#### local_less
+
+_Python_:
+
+    Tile local_less(Tile tile1, Tile rhs)
+    Tile local_less(Tile tile1, Int rhs)
+    Tile local_less(Tile tile1, Double rhs)
+
+_SQL_: `rf_less`
+
+Returns a `tile` column containing the element-wise evaluation of whether `tile1` is less than `rhs`.
+
+#### local_less_equal
+
+_Python_:
+
+    Tile local_less_equal(Tile tile1, Tile rhs)
+    Tile local_less_equal(Tile tile1, Int rhs)
+    Tile local_less_equal(Tile tile1, Double rhs)
+
+_SQL_: `rf_less_equal`
+
+Returns a `tile` column containing the element-wise evaluation of whether `tile1` is less than or equal to `rhs`.
+
+#### local_greater
+
+_Python_:
+
+    Tile local_greater(Tile tile1, Tile rhs)
+    Tile local_greater(Tile tile1, Int rhs)
+    Tile local_greater(Tile tile1, Double rhs)
+
+_SQL_: `rf_greater`
+
+Returns a `tile` column containing the element-wise evaluation of whether `tile1` is greater than `rhs`.
+
+#### local_greater_equal
+
+_Python_:
+
+    Tile local_greater_equal(Tile tile1, Tile rhs)
+    Tile local_greater_equal(Tile tile1, Int rhs)
+    Tile local_greater_equal(Tile tile1, Double rhs)
+
+_SQL_: `rf_greater_equal`
+
+Returns a `tile` column containing the element-wise evaluation of whether `tile1` is greater than or equal to `rhs`.
+
+#### local_equal
+
+_Python_:
+
+    Tile local_equal(Tile tile1, Tile rhs)
+    Tile local_equal(Tile tile1, Int rhs)
+    Tile local_equal(Tile tile1, Double rhs)
+
+_SQL_: `rf_equal`
+
+Returns a `tile` column containing the element-wise equality of `tile1` and `rhs`.
+
+#### local_unequal
+
+_Python_:
+
+    Tile local_unequal(Tile tile1, Tile rhs)
+    Tile local_unequal(Tile tile1, Int rhs)
+    Tile local_unequal(Tile tile1, Double rhs)
+
+_SQL_: `rf_unequal`
+
+Returns a `tile` column containing the element-wise inequality of `tile1` and `rhs`.
+
+#### round
+
+_Python_:
+
+    Tile round(Tile tile)
+
+_SQL_: `rf_round`
+
+Round cell values to the nearest integer without changing the cell type.
+
+#### exp
+
+_Python_:
+
+    Tile exp(Tile tile)
+
+_SQL_: `rf_exp`
+
+Performs the cell-wise exponential.
+
+#### exp10
+
+_Python_:
+
+    Tile exp10(Tile tile)
+
+_SQL_: `rf_exp10`
+
+Compute 10 to the power of cell values.
+
+#### exp2
+
+_Python_:
+
+    Tile exp2(Tile tile)
+
+_SQL_: `rf_exp2`
+
+Compute 2 to the power of cell values.
+
+#### expm1
+
+_Python_:
+
+    Tile expm1(Tile tile)
+
+_SQL_: `rf_expm1`
+
+Performs the cell-wise exponential, then subtracts one. Inverse of @ref:[`log1p`](reference.md#log1p).
+
+#### log
+
+_Python_:
+
+    Tile log(Tile tile)
+
+_SQL_: `rf_log`
+
+Performs the cell-wise natural logarithm.
+
+#### log10
+
+_Python_:
+
+    Tile log10(Tile tile)
+
+_SQL_: `rf_log10`
+
+Performs the cell-wise logarithm with base 10.
+
+#### log2
+
+_Python_:
+
+    Tile log2(Tile tile)
+
+_SQL_: `rf_log2`
+
+Performs the cell-wise logarithm with base 2.
+
+#### log1p
+
+_Python_:
+
+    Tile log1p(Tile tile)
+
+_SQL_: `rf_log1p`
+
+Performs the cell-wise natural logarithm of one plus the cell value. Inverse of @ref:[`expm1`](reference.md#expm1).
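+
+As a hedged sketch of this inverse pair (the `rf` DataFrame and `tile` column are hypothetical):
+
+```python
+from pyrasterframes.rasterfunctions import expm1, log1p
+
+# log1p computes log(1 + x) and expm1 computes exp(x) - 1, so the
+# composition reproduces the original cells up to floating point error.
+roundtrip = rf.select(expm1(log1p(rf.tile)).alias('roundtrip'))
+```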
+
+### Tile Statistics
+
+The following functions compute a statistical summary per row of a `tile` column. The statistics are computed across the cells of a single `tile`, within each DataFrame Row. Consider the following example.
+
+```python
+import pyspark.sql.functions as F
+from pyrasterframes.rasterfunctions import tile_sum
+
+spark.sql("""
+  SELECT 1 as id, rf_tile_ones(5, 5, 'float32') as t
+  UNION
+  SELECT 2 as id, rf_local_multiply(rf_tile_ones(5, 5, 'float32'), 3) as t
+  """).select(F.col('id'), tile_sum(F.col('t'))).show()
+
++---+-----------+
+| id|tile_sum(t)|
++---+-----------+
+|  2|       75.0|
+|  1|       25.0|
++---+-----------+
+```
+
+#### tile_sum
+
+_Python_:
+
+    Double tile_sum(Tile tile)
+
+_SQL_: `rf_tile_sum`
+
+Computes the sum of cells in each row of column `tile`, ignoring nodata values.
+
+#### tile_mean
+
+_Python_:
+
+    Double tile_mean(Tile tile)
+
+_SQL_: `rf_tile_mean`
+
+Computes the mean of cells in each row of column `tile`, ignoring nodata values.
+
+#### tile_min
+
+_Python_:
+
+    Double tile_min(Tile tile)
+
+_SQL_: `rf_tile_min`
+
+Computes the min of cells in each row of column `tile`, ignoring nodata values.
+
+#### tile_max
+
+_Python_:
+
+    Double tile_max(Tile tile)
+
+_SQL_: `rf_tile_max`
+
+Computes the max of cells in each row of column `tile`, ignoring nodata values.
+
+#### no_data_cells
+
+_Python_:
+
+    Long no_data_cells(Tile tile)
+
+_SQL_: `rf_no_data_cells`
+
+Return the count of nodata cells in the `tile`.
+
+#### data_cells
+
+_Python_:
+
+    Long data_cells(Tile tile)
+
+_SQL_: `rf_data_cells`
+
+Return the count of data cells in the `tile`.
+
+#### tile_stats
+
+_Python_:
+
+    Struct[Long, Long, Double, Double, Double, Double] tile_stats(Tile tile)
+
+_SQL_: `rf_tile_stats`
+
+Computes the following statistics of cells in each row of column `tile`: data cell count, nodata cell count, minimum, maximum, mean, and variance. The minimum, maximum, mean, and variance are computed ignoring nodata values.
+
+#### tile_histogram
+
+_Python_:
+
+    Struct[Struct[Long, Long, Double, Double, Double, Double], Array[Struct[Double, Long]]] tile_histogram(Tile tile)
+
+_SQL_: `rf_tile_histogram`
+
+Computes a statistical summary of cell values within each row of `tile`. The resulting column has the below schema. Note that several of the other `tile` statistics functions are convenience methods to extract parts of this result. Related is the @ref:[`agg_approx_histogram`](reference.md#agg-approx-histogram), which computes the statistics across all rows in a group.
+
+```
+ |-- tile_histogram: struct (nullable = true)
+ | |-- stats: struct (nullable = true)
+ | | |-- dataCells: long (nullable = false)
+ | | |-- noDataCells: long (nullable = false)
+ | | |-- min: double (nullable = false)
+ | | |-- max: double (nullable = false)
+ | | |-- mean: double (nullable = false)
+ | | |-- variance: double (nullable = false)
+ | |-- bins: array (nullable = true)
+ | | |-- element: struct (containsNull = true)
+ | | | |-- value: double (nullable = false)
+ | | | |-- count: long (nullable = false)
+```
+
+### Aggregate Tile Statistics
+
+These functions compute statistical summaries over all of the cell values *and* across all the rows in the DataFrame or group. The example use below computes a single double-valued mean per month, across all data cells in the `red_band` `tile` type column. This would return at most twelve rows.
+
+```python
+from pyspark.sql.functions import month
+from pyrasterframes.rasterfunctions import agg_mean
+rf.groupby(month(rf.datetime)).agg(agg_mean(rf.red_band).alias('red_mean_monthly'))
+```
+
+Continuing our example from the @ref:[Tile Statistics](reference.md#tile-statistics) section, consider the following. Note that only a single row is returned. It is averaging 25 values of 1.0 and 25 values of 3.0, across the fifty cells in two rows.
+
+```python
+import pyspark.sql.functions as F
+
+spark.sql("""
+SELECT 1 as id, rf_tile_ones(5, 5, 'float32') as t
+UNION
+SELECT 2 as id, rf_local_multiply_scalar(rf_tile_ones(5, 5, 'float32'), 3) as t
+""").agg(agg_mean(F.col('t'))).show(10, False)
+
++-----------+
+|agg_mean(t)|
++-----------+
+|2.0        |
++-----------+
+```
+
+#### agg_mean
+
+_Python_:
+
+    Double agg_mean(Tile tile)
+
+_SQL_: @ref:[`rf_agg_stats`](reference.md#agg-stats)`(tile).mean`
+
+Aggregates over the `tile` and returns the mean of cell values, ignoring nodata. Equivalent to @ref:[`agg_stats`](reference.md#agg-stats)`.mean`.
+
+#### agg_data_cells
+
+_Python_:
+
+    Long agg_data_cells(Tile tile)
+
+_SQL_: @ref:[`rf_agg_stats`](reference.md#agg-stats)`(tile).dataCells`
+
+Aggregates over the `tile` and returns the count of data cells. Equivalent to @ref:[`agg_stats`](reference.md#agg-stats)`.dataCells`. Cf. `data_cells`; equivalent code:
+
+```python
+rf.select(agg_data_cells(rf.tile).alias('agg_data_cell')).show()
+# Equivalent to
+rf.agg(F.sum(data_cells(rf.tile)).alias('agg_data_cell')).show()
+```
+
+#### agg_no_data_cells
+
+_Python_:
+
+    Long agg_no_data_cells(Tile tile)
+
+_SQL_: @ref:[`rf_agg_stats`](reference.md#agg-stats)`(tile).noDataCells`
+
+Aggregates over the `tile` and returns the count of nodata cells. Equivalent to @ref:[`agg_stats`](reference.md#agg-stats)`.noDataCells`. Cf. @ref:[`no_data_cells`](reference.md#no-data-cells), the row-wise count of nodata cells.
+
+#### agg_stats
+
+_Python_:
+
+    Struct[Long, Long, Double, Double, Double, Double] agg_stats(Tile tile)
+
+_SQL_: `rf_agg_stats`
+
+Aggregates over the `tile` and returns statistical summaries of cell values: number of data cells, number of nodata cells, minimum, maximum, mean, and variance. The minimum, maximum, mean, and variance ignore the presence of nodata.
+
+#### agg_approx_histogram
+
+_Python_:
+
+    Struct[Struct[Long, Long, Double, Double, Double, Double], Array[Struct[Double, Long]]] agg_approx_histogram(Tile tile)
+
+_SQL_: `rf_agg_approx_histogram`
+
+Aggregates over the `tile` and returns statistical summaries of the cell values, including a histogram, in the below schema. The `bins` array is of tuples of histogram values and counts. Typically values are plotted on the x-axis and counts on the y-axis.
+
+Note that several of the other cell value statistics functions are convenience methods to extract parts of this result. Related is the @ref:[`tile_histogram`](reference.md#tile-histogram) function, which operates on a single row at a time.
+
+```
+ |-- agg_approx_histogram: struct (nullable = true)
+ | |-- stats: struct (nullable = true)
+ | | |-- dataCells: long (nullable = false)
+ | | |-- noDataCells: long (nullable = false)
+ | | |-- min: double (nullable = false)
+ | | |-- max: double (nullable = false)
+ | | |-- mean: double (nullable = false)
+ | | |-- variance: double (nullable = false)
+ | |-- bins: array (nullable = true)
+ | | |-- element: struct (containsNull = true)
+ | | | |-- value: double (nullable = false)
+ | | | |-- count: long (nullable = false)
+```
+
+### Tile Local Aggregate Statistics
+
+Local statistics compute the element-wise statistics across a DataFrame or group of `tile`s, resulting in a `tile` that has the same dimension.
+
+Consider again our example for Tile Statistics and Aggregate Tile Statistics, this time applying @ref:[`agg_local_mean`](reference.md#agg-local-mean). We see that it is computing the element-wise mean across the two rows. In this case it is computing the mean of one value of 1.0 and one value of 3.0 to arrive at the element-wise mean, but doing so twenty-five times, once for each position in the `tile`.
+
+```python
+import pyspark.sql.functions as F
+from pyrasterframes.rasterfunctions import agg_local_mean, explode_tiles, tile_dimensions
+
+lam = spark.sql("""
+SELECT 1 as id, rf_tile_ones(5, 5, 'float32') as t
+UNION
+SELECT 2 as id, rf_local_multiply(rf_tile_ones(5, 5, 'float32'), 3) as t
+""").agg(agg_local_mean(F.col('t')).alias('l'))
+
+## agg_local_mean returns a tile
+lam.select(tile_dimensions(lam.l)).show()
+##
++------------------+
+|tile_dimensions(l)|
++------------------+
+|            [5, 5]|
++------------------+
+##
+
+lam.select(explode_tiles(lam.l)).show(10, False)
+##
++------------+---------+---+
+|column_index|row_index|l  |
++------------+---------+---+
+|0           |0        |2.0|
+|1           |0        |2.0|
+|2           |0        |2.0|
+|3           |0        |2.0|
+|4           |0        |2.0|
+|0           |1        |2.0|
+|1           |1        |2.0|
+|2           |1        |2.0|
+|3           |1        |2.0|
+|4           |1        |2.0|
++------------+---------+---+
+only showing top 10 rows
+```
+
+#### agg_local_max
+
+_Python_:
+
+    Tile agg_local_max(Tile tile)
+
+_SQL_: `rf_agg_local_max`
+
+Compute the cell-local maximum over Tiles in a column.
+
+#### agg_local_min
+
+_Python_:
+
+    Tile agg_local_min(Tile tile)
+
+_SQL_: `rf_agg_local_min`
+
+Compute the cell-local minimum over Tiles in a column.
+
+#### agg_local_mean
+
+_Python_:
+
+    Tile agg_local_mean(Tile tile)
+
+_SQL_: `rf_agg_local_mean`
+
+Compute the cell-local mean over Tiles in a column.
+
+#### agg_local_data_cells
+
+_Python_:
+
+    Tile agg_local_data_cells(Tile tile)
+
+_SQL_: `rf_agg_local_data_cells`
+
+Compute the cell-local count of data cells over Tiles in a column. The returned `tile` has a cell type of `int32`.
+
+#### agg_local_no_data_cells
+
+_Python_:
+
+    Tile agg_local_no_data_cells(Tile tile)
+
+_SQL_: `rf_agg_local_no_data_cells`
+
+Compute the cell-local count of nodata cells over Tiles in a column. The returned `tile` has a cell type of `int32`.
+
+#### agg_local_stats
+
+_Python_:
+
+    Struct[Tile, Tile, Tile, Tile, Tile] agg_local_stats(Tile tile)
+
+_SQL_: `rf_agg_local_stats`
+
+Compute the cell-local aggregate count, minimum, maximum, mean, and variance for a column of Tiles. Returns a struct of five `tile`s.
+
+### Converting Tiles
+
+RasterFrames provides several ways to convert a `tile` into other data structures. See also the functions for @ref:[creating tiles](reference.md#tile-creation).
+
+#### explode_tiles
+
+_Python_:
+
+    Int, Int, Numeric* explode_tiles(Tile* tile)
+
+_SQL_: `rf_explode_tiles`
+
+Create a row for each cell in `tile` columns. Many `tile` columns can be passed in, and the returned DataFrame will have one numeric column per input. There will also be columns for `column_index` and `row_index`. Inverse of @ref:[`assemble_tile`](reference.md#assemble-tile). When using this function, be sure to have a unique identifier for rows in order to successfully invert the operation.
+
+### Converting Tiles
+
+RasterFrames provides several ways to convert a `tile` into other data structures. See also functions for @ref:[creating tiles](reference.md#tile-creation).
+
+#### explode_tiles
+
+_Python_:
+
+    Int, Int, Numeric* explode_tiles(Tile* tile)
+
+_SQL_: `rf_explode_tiles`
+
+Create a row for each cell in `tile` columns. Many `tile` columns can be passed in, and the returned DataFrame will have one numeric column per input. There will also be columns for `column_index` and `row_index`. Inverse of @ref:[`assemble_tile`](reference.md#assemble-tile). When using this function, be sure to have a unique identifier for rows in order to successfully invert the operation.
+
+#### explode_tiles_sample
+
+_Python_:
+
+    Int, Int, Numeric* explode_tiles_sample(Double sample_frac, Long seed, Tile* tile)
+
+Python only. As with @ref:[`explode_tiles`](reference.md#explode-tiles), but taking a randomly sampled subset of cells. Equivalent to the code below, but optimized for speed. Parameter `sample_frac` should be between 0.0 and 1.0.
+
+```python
+df.select(df.id, explode_tiles(df.tile1, df.tile2, df.tile3)) \
+  .sample(False, 0.05, 8675309)
+# Equivalent result, faster
+df.select(df.id, explode_tiles_sample(0.05, 8675309, df.tile1, df.tile2, df.tile3))
+```
+
+#### tile_to_int_array
+
+_Python_:
+
+    Array tile_to_int_array(Tile tile)
+
+_SQL_: `rf_tile_to_int_array`
+
+Convert Tile column to Spark SQL [Array](http://spark.apache.org/docs/2.3.2/api/python/pyspark.sql.html#pyspark.sql.types.ArrayType), in row-major order. Float cell types will be coerced to integral type by flooring. Deprecated in favor of `tile_to_array_int`.
+
+#### tile_to_double_array
+
+_Python_:
+
+    Array tile_to_double_array(Tile tile)
+
+_SQL_: `rf_tile_to_double_array`
+
+Convert Tile column to Spark SQL [Array](http://spark.apache.org/docs/2.3.2/api/python/pyspark.sql.html#pyspark.sql.types.ArrayType), in row-major order. Integral cell types will be coerced to floats. Deprecated in favor of `tile_to_array_double`.
+
+#### render_ascii
+
+_Python_:
+
+    String render_ascii(Tile tile)
+
+_SQL_: `rf_render_ascii`
+
+Pretty print the tile values as plain text.
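+
+A short sketch of these conversion functions in use, assuming a DataFrame `rf` with a `tile` column named `tile`:
+
+```python
+# Cell values in row-major order: cell (r, c) lands at index r * cols + c
+rf.select(tile_to_double_array(rf.tile).alias('cells')).show(5, False)
+
+# Quick visual inspection of tile contents while debugging
+rf.select(render_ascii(rf.tile)).show(1, False)
+```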
-For the most up to date list of UDFs, look at API documentation for @scaladoc[`RasterFunctions`][RasterFunctions]. These UDFs are also registered with the SQL engine under the same name but with a `rf_` prefix (e.g. `data_cells` becomes `rf_data_cells`).
-The full API documentation can be found [here][scaladoc].
 [RasterFunctions]: astraea.spark.rasterframes.RasterFunctions
 [scaladoc]: latest/api/index.html
+
diff --git a/docs/src/main/tut/release-notes.md b/docs/src/main/tut/release-notes.md
index 3b4daa5ce..c043ac772 100644
--- a/docs/src/main/tut/release-notes.md
+++ b/docs/src/main/tut/release-notes.md
@@ -4,9 +4,17 @@
 ### 0.8.0
+* Added new tile functions `round`, `log`, `log10`, `log2`, `log1p`, `exp`, `exp10`, `exp2`, `expm1`, `resample`.
 * Introduced at the source level the concept of a `RasterSource` and `RasterRef`, enabling lazy/delayed read of sub-scene tiles.
 * _Deprecation_: Tile column functions (in `RasterFunctions`) and SQL registered names have all been renamed to follow `snake_case` conventions, matching SQL and Python. A temporary compatibility shim is included so that code built against 0.7.1 and earlier still works. These will be marked as deprecated.
 * Added `withKryoSerialization` extension methods on `SparkSession.Builder` and `SparkConf`.
+* _Breaking_: In Scala and SQL, `..._scalar` functions (e.g. `local_add_scalar`) have been removed. Non-scalar forms now dynamically detect the type of the right-hand side.
+* _Breaking_: `tileToArray` has been replaced with `tile_to_array_double` and `tile_to_array_int`.
+* Added `render_matrix` debugging function.
+* _Breaking_: renamed `agg_histogram` to `agg_approx_histogram`, `local_agg_stats` to `agg_local_stats`, `local_agg_max` to `agg_local_max`, `local_agg_min` to `agg_local_min`, `local_agg_mean` to `agg_local_mean`, `local_agg_data_cells` to `agg_local_data_cells`, `local_agg_no_data_cells` to `agg_local_no_data_cells`.
+* _Breaking_: `CellHistogram` no longer carries along approximate statistics, due to confusing behavior. Use `agg_stats` instead.
+* Introduced `LocalCellStatistics` class to wrap together results from `LocalStatsAggregate`.
+* _Breaking_: `TileDimensions` moved from `astraea.spark.rasterframes` to `astraea.spark.rasterframes.model`.
 
 ## 0.7.x
 
diff --git a/experimental/src/it/scala/astraea/spark/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala b/experimental/src/it/scala/astraea/spark/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala
index 3b5f43f14..e86376246 100644
--- a/experimental/src/it/scala/astraea/spark/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala
+++ b/experimental/src/it/scala/astraea/spark/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala
@@ -63,7 +63,7 @@ class L8CatalogRelationTest extends TestEnvironment {
     it("should download geotiff as tiles") {
       val b01 = scenes
         .select($"*", read_tiles(l8_band_url("B1")))
-      assert(b01.count() === 289)
+      assert(b01.count() === 1089)
     }
   }
 }
diff --git a/experimental/src/it/scala/astraea/spark/rasterframes/experimental/datasource/awspds/L8RelationTest.scala b/experimental/src/it/scala/astraea/spark/rasterframes/experimental/datasource/awspds/L8RelationTest.scala
index e6f1ed89d..688866a93 100644
--- a/experimental/src/it/scala/astraea/spark/rasterframes/experimental/datasource/awspds/L8RelationTest.scala
+++ b/experimental/src/it/scala/astraea/spark/rasterframes/experimental/datasource/awspds/L8RelationTest.scala
@@ -50,10 +50,6 @@ class L8RelationTest extends TestEnvironment with BeforeAndAfterAll with BeforeA
     scenes = sql(query).cache()
   }
 
-  after {
-    spark.sparkContext.register()
-  }
-
   describe("Read L8 on PDS as a DataSource") {
     it("should count scenes") {
       assert(scenes.schema.size === 4)
diff --git a/experimental/src/main/scala/astraea/spark/rasterframes/experimental/datasource/DownloadExpression.scala b/experimental/src/main/scala/astraea/spark/rasterframes/experimental/datasource/DownloadExpression.scala
index 124366dbf..32d55645b 100644
--- a/experimental/src/main/scala/astraea/spark/rasterframes/experimental/datasource/DownloadExpression.scala
+++ b/experimental/src/main/scala/astraea/spark/rasterframes/experimental/datasource/DownloadExpression.scala
@@ -63,7 +63,7 @@ case class DownloadExpression(override val child: Expression, colPrefix: String)
 }
 
 object DownloadExpression {
-  import astraea.spark.rasterframes.encoders.SparkDefaultEncoders._
+  import astraea.spark.rasterframes.encoders.StandardEncoders.PrimitiveEncoders.arrayEnc
 
   def apply(urlColumn: Column): TypedColumn[Any, Array[Byte]] =
     new Column(
diff --git a/experimental/src/main/scala/astraea/spark/rasterframes/experimental/datasource/awspds/L8Relation.scala b/experimental/src/main/scala/astraea/spark/rasterframes/experimental/datasource/awspds/L8Relation.scala
index 35337ca31..845f5aad3 100644
--- a/experimental/src/main/scala/astraea/spark/rasterframes/experimental/datasource/awspds/L8Relation.scala
+++ b/experimental/src/main/scala/astraea/spark/rasterframes/experimental/datasource/awspds/L8Relation.scala
@@ -24,7 +24,7 @@ import astraea.spark.rasterframes._
 import
astraea.spark.rasterframes.encoders.CatalystSerializer import astraea.spark.rasterframes.encoders.CatalystSerializer._ import astraea.spark.rasterframes.experimental.datasource.awspds.L8Relation.Bands -import astraea.spark.rasterframes.expressions.{RasterSourceToRasterRefs, URIToRasterSource} +import astraea.spark.rasterframes.expressions.transformers.{RasterSourceToRasterRefs, URIToRasterSource} import astraea.spark.rasterframes.ref.RasterRef import astraea.spark.rasterframes.ref.RasterSource.ReadCallback import astraea.spark.rasterframes.rules.SpatialFilters.{Contains, Intersects} diff --git a/experimental/src/main/scala/astraea/spark/rasterframes/experimental/datasource/awspds/package.scala b/experimental/src/main/scala/astraea/spark/rasterframes/experimental/datasource/awspds/package.scala index 7ea77eb07..60b1169fc 100644 --- a/experimental/src/main/scala/astraea/spark/rasterframes/experimental/datasource/awspds/package.scala +++ b/experimental/src/main/scala/astraea/spark/rasterframes/experimental/datasource/awspds/package.scala @@ -21,7 +21,7 @@ package astraea.spark.rasterframes.experimental.datasource import org.apache.spark.sql._ import org.apache.spark.sql.functions._ -import astraea.spark.rasterframes.encoders.SparkDefaultEncoders._ +import astraea.spark.rasterframes.encoders.StandardEncoders.PrimitiveEncoders._ /** * Module support. diff --git a/experimental/src/main/scala/astraea/spark/rasterframes/experimental/datasource/package.scala b/experimental/src/main/scala/astraea/spark/rasterframes/experimental/datasource/package.scala index ea5904f16..4e6129be4 100644 --- a/experimental/src/main/scala/astraea/spark/rasterframes/experimental/datasource/package.scala +++ b/experimental/src/main/scala/astraea/spark/rasterframes/experimental/datasource/package.scala @@ -22,7 +22,7 @@ package astraea.spark.rasterframes.experimental import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.FunctionRegistry -import org.apache.spark.sql.rf.VersionShims +import org.apache.spark.sql.rf.VersionShims._ /** @@ -42,6 +42,6 @@ package object datasource { // Expression-oriented functions have a different registration scheme // Currently have to register with the `builtin` registry due to Spark data hiding. 
val registry: FunctionRegistry = rf.registry(sqlContext) - VersionShims.registerExpression(registry, "rf_read_tiles", ReadTilesExpression.apply) + registry.registerExpression[ReadTilesExpression]("rf_read_tiles") } } diff --git a/project/ProjectPlugin.scala b/project/ProjectPlugin.scala index 3a491200d..4f575edb2 100644 --- a/project/ProjectPlugin.scala +++ b/project/ProjectPlugin.scala @@ -40,7 +40,6 @@ object ProjectPlugin extends AutoPlugin { rfSparkVersion in ThisBuild := "2.3.2" , rfGeoTrellisVersion in ThisBuild := "2.1.0", rfGeoMesaVersion in ThisBuild := "2.1.0", - publishTo := sonatypePublishTo.value, publishMavenStyle := true, publishArtifact in (Compile, packageDoc) := true, @@ -67,6 +66,12 @@ object ProjectPlugin extends AutoPlugin { name = "Ben Guseman", email = "bguseman@astraea.io", url = url("http://www.astraea.io") + ), + Developer( + id = "vpipkt", + name = "Jason Brown", + email = "jbrown@astraea.io", + url = url("http://www.astraea.io") ) ), initialCommands in console := @@ -106,17 +111,17 @@ object ProjectPlugin extends AutoPlugin { MergeStrategy.rename case PathList("META-INF", xs @ _*) ⇒ xs map {_.toLowerCase} match { - case ("manifest.mf" :: Nil) | ("index.list" :: Nil) | ("dependencies" :: Nil) ⇒ + case "manifest.mf" :: Nil | "index.list" :: Nil | "dependencies" :: Nil ⇒ MergeStrategy.discard - case ps @ (x :: _) if ps.last.endsWith(".sf") || ps.last.endsWith(".dsa") ⇒ + case ps @ _ :: _ if ps.last.endsWith(".sf") || ps.last.endsWith(".dsa") ⇒ MergeStrategy.discard case "plexus" :: _ ⇒ MergeStrategy.discard case "services" :: _ ⇒ MergeStrategy.filterDistinctLines - case ("spring.schemas" :: Nil) | ("spring.handlers" :: Nil) ⇒ + case "spring.schemas" :: Nil | "spring.handlers" :: Nil ⇒ MergeStrategy.filterDistinctLines - case ("maven" :: rest ) if rest.lastOption.exists(_.startsWith("pom")) ⇒ + case "maven" :: rest if rest.lastOption.exists(_.startsWith("pom")) ⇒ MergeStrategy.discard case _ ⇒ MergeStrategy.deduplicate } @@ -126,8 +131,8 @@ object ProjectPlugin extends AutoPlugin { ) def releaseSettings: Seq[Def.Setting[_]] = { - val buildSite: (State) ⇒ State = releaseStepTask(makeSite in LocalProject("docs")) - val publishSite: (State) ⇒ State = releaseStepTask(ghpagesPushSite in LocalProject("docs")) + val buildSite: State ⇒ State = releaseStepTask(makeSite in LocalProject("docs")) + val publishSite: State ⇒ State = releaseStepTask(ghpagesPushSite in LocalProject("docs")) Seq( releaseIgnoreUntrackedFiles := true, releaseTagName := s"${version.value}", diff --git a/project/plugins.sbt b/project/plugins.sbt index 4d079b71d..a3fd9b0e9 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -26,4 +26,5 @@ addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.1") addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.7-astraea.1") addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.4.1") +addSbtPlugin("net.vonbuchholtz" % "sbt-dependency-check" % "0.2.10") diff --git a/pyrasterframes/python/pyrasterframes/rasterfunctions.py b/pyrasterframes/python/pyrasterframes/rasterfunctions.py index 1f96b3906..589b256d8 100644 --- a/pyrasterframes/python/pyrasterframes/rasterfunctions.py +++ b/pyrasterframes/python/pyrasterframes/rasterfunctions.py @@ -154,7 +154,7 @@ def _(data_tile, mask_tile, mask_value): _rf_unique_functions = { 'array_to_tile': _create_arrayToTile(), 'assemble_tile': _create_assembleTile(), - 'cellTypes': lambda: _context_call('cellTypes'), + 'cell_types': lambda: _context_call('cell_types'), 'convert_cell_type': _create_convertCellType(), 'explode_tiles': 
_create_explode_tiles(),
     'explode_tiles_sample': _create_explode_tiles_sample(),
@@ -196,10 +196,13 @@ def _(data_tile, mask_tile, mask_value):
     # ------- RasterFrames functions -------
     'tile_dimensions': 'Query the number of (cols, rows) in a Tile.',
     'envelope': 'Extracts the bounding box (envelope) of the geometry.',
-    'tile_to_int_array': 'Flattens Tile into an array of integers.',
-    'tile_to_double_array': 'Flattens Tile into an array of doubles.',
+    'tile_to_int_array': 'Flattens Tile into an array of integers. Deprecated in favor of `tile_to_array_int`.',
+    'tile_to_double_array': 'Flattens Tile into an array of doubles. Deprecated in favor of `tile_to_array_double`.',
+    'tile_to_array_int': 'Flattens Tile into an array of integers.',
+    'tile_to_array_double': 'Flattens Tile into an array of doubles.',
     'cell_type': 'Extract the Tile\'s cell type',
-    'agg_histogram': 'Compute the full column aggregate floating point histogram',
+    'is_no_data_tile': 'Report if the Tile is entirely NODATA cells',
+    'agg_approx_histogram': 'Compute the full column aggregate floating point histogram',
     'agg_stats': 'Compute the full column aggregate floating point statistics',
     'agg_mean': 'Computes the column aggregate mean',
     'agg_data_cells': 'Computes the number of non-NoData cells in a column',
@@ -219,11 +222,11 @@ def _(data_tile, mask_tile, mask_value):
     'local_divide': 'Divide two Tiles',
     'normalized_difference': 'Compute the normalized difference of two tiles',
     'local_agg_stats': 'Compute cell-local aggregate descriptive statistics for a column of Tiles.',
-    'local_agg_max': 'Compute the cell-wise/local max operation between Tiles in a column.',
-    'local_agg_min': 'Compute the cellwise/local min operation between Tiles in a column.',
-    'local_agg_mean': 'Compute the cellwise/local mean operation between Tiles in a column.',
-    'local_agg_data_cells': 'Compute the cellwise/local count of non-NoData cells for all Tiles in a column.',
-    'local_agg_no_data_cells': 'Compute the cellwise/local count of NoData cells for all Tiles in a column.',
+    'agg_local_max': 'Compute the cell-wise/local max operation between Tiles in a column.',
+    'agg_local_min': 'Compute the cellwise/local min operation between Tiles in a column.',
+    'agg_local_mean': 'Compute the cellwise/local mean operation between Tiles in a column.',
+    'agg_local_data_cells': 'Compute the cellwise/local count of non-NoData cells for all Tiles in a column.',
+    'agg_local_no_data_cells': 'Compute the cellwise/local count of NoData cells for all Tiles in a column.',
     'mask': 'Where the mask (second) tile contains NODATA, replace values in the source (first) tile with NODATA.',
     'inverse_mask': 'Where the mask (second) tile DOES NOT contain NODATA, replace values in the source (first) tile with NODATA.',
     'local_less': 'Cellwise less than comparison between two tiles',
@@ -232,6 +235,17 @@ def _(data_tile, mask_tile, mask_value):
     'local_greater_equal': 'Cellwise greater than or equal to comparison between two tiles',
     'local_equal': 'Cellwise equality comparison between two tiles',
     'local_unequal': 'Cellwise inequality comparison between two tiles',
+    'round': 'Round cell values to the nearest integer without changing the cell type',
+    'log': 'Performs cell-wise natural logarithm',
+    'log10': 'Performs cell-wise logarithm with base 10',
+    'log2': 'Performs cell-wise logarithm with base 2',
+    'log1p': 'Performs natural logarithm of cell values plus one',
+    'exp': 'Performs cell-wise exponential',
+    'exp2': 'Compute 2 to the power of cell values',
+    'exp10': 'Compute 10 to the power of cell values',
+    'expm1': 'Performs cell-wise exponential, then subtract one',
+    'resample': 'Resample tile to a different size, based on a scalar factor or a tile whose dimensions to match',
+
     # ------- JTS functions -------
     # spatial constructors
     'st_geomFromGeoHash': '',
diff --git a/pyrasterframes/python/pyrasterframes/types.py b/pyrasterframes/python/pyrasterframes/types.py
index c4c509f4a..b82cfb70c 100644
--- a/pyrasterframes/python/pyrasterframes/types.py
+++ b/pyrasterframes/python/pyrasterframes/types.py
@@ -160,15 +160,14 @@ def module(cls):
     def scalaUDT(cls):
         return 'org.apache.spark.sql.rf.TileUDT'
 
+    # NB: These will need implementations if UDFs are to be supported,
+    # preferably in numpy arrays.
     def serialize(self, obj):
         if (obj is None):
             return None
-        return Row(obj.cellType().name().encode("UTF8"),
-                   obj.cols().toShort(),
-                   obj.rows().toShort(),
-                   obj.toBytes)
+        return None
 
     def deserialize(self, datum):
-        return RFContext._jvm_mirror().generate_tile(datum[0], datum[1], datum[2], datum[3])
+        return None
diff --git a/pyrasterframes/python/tests/PyRasterFramesTests.py b/pyrasterframes/python/tests/PyRasterFramesTests.py
index 954af08f3..b47e25577 100644
--- a/pyrasterframes/python/tests/PyRasterFramesTests.py
+++ b/pyrasterframes/python/tests/PyRasterFramesTests.py
@@ -98,7 +98,11 @@ def test_general(self):
             .withColumn('sum', tile_sum(self.tileCol)) \
             .withColumn('stats', tile_stats(self.tileCol)) \
             .withColumn('envelope', envelope('bounds')) \
-            .withColumn('ascii', render_ascii(self.tileCol))
+            .withColumn('ascii', render_ascii(self.tileCol)) \
+            .withColumn('log', log(self.tileCol)) \
+            .withColumn('exp', exp(self.tileCol)) \
+            .withColumn('expm1', expm1(self.tileCol)) \
+            .withColumn('round', round(self.tileCol))
 
         df.show()
@@ -117,7 +121,7 @@ def test_aggregations(self):
             agg_data_cells(self.tileCol),
             agg_no_data_cells(self.tileCol),
             agg_stats(self.tileCol),
-            agg_histogram(self.tileCol)
+            agg_approx_histogram(self.tileCol)
         )
         aggs.show()
         row = aggs.first()
@@ -126,7 +130,7 @@
         print(row['agg_data_cells(tile)'])
         self.assertEqual(row['agg_data_cells(tile)'], 387000)
         self.assertEqual(row['agg_no_data_cells(tile)'], 1000)
-        self.assertEqual(row['agg_stats(tile)'].dataCells, row['agg_data_cells(tile)'])
+        self.assertEqual(row['agg_stats(tile)'].data_cells, row['agg_data_cells(tile)'])
 
     def test_sql(self):
@@ -190,17 +194,28 @@ def test_maskByValue(self):
         mask_value = 4
 
         rf1 = self.rf.select(self.rf.tile,
-                             local_multiply_scalar_int(
+                             local_multiply(
                                  convert_cell_type(
                                      local_greater_scalar_int(self.rf.tile, 25000),
                                      "uint8"),
-                                 mask_value).alias('mask'))
+                                 lit(mask_value)).alias('mask'))
         rf2 = rf1.select(rf1.tile, mask_by_value(rf1.tile, rf1.mask, lit(mask_value)).alias('masked'))
         result = rf2.agg(agg_no_data_cells(rf2.tile) < agg_no_data_cells(rf2.masked)) \
             .collect()[0][0]
         self.assertTrue(result)
 
+    def test_resample(self):
+        from pyspark.sql.functions import lit
+        result = self.rf.select(
+            tile_min(local_equal(
+                resample(resample(self.rf.tile, lit(2)), lit(0.5)),
+                self.rf.tile))
+        ).collect()[0][0]
+
+        self.assertTrue(result == 1)  # shorthand for all values are true
+
+
 def suite():
     functionTests = unittest.TestSuite()
     functionTests.addTest(RasterFunctionsTest('test_identify_columns'))
@@ -212,6 +227,7 @@ def suite():
     functionTests.addTest(RasterFunctionsTest('test_explode'))
     functionTests.addTest(RasterFunctionsTest('test_sql'))
     functionTests.addTest(RasterFunctionsTest('test_maskByValue'))
+
functionTests.addTest(RasterFunctionsTest('test_resample')) return functionTests diff --git a/pyrasterframes/src/main/scala/astraea/spark/rasterframes/py/PyRFContext.scala b/pyrasterframes/src/main/scala/astraea/spark/rasterframes/py/PyRFContext.scala index e4255d515..b80a8a3f8 100644 --- a/pyrasterframes/src/main/scala/astraea/spark/rasterframes/py/PyRFContext.scala +++ b/pyrasterframes/src/main/scala/astraea/spark/rasterframes/py/PyRFContext.scala @@ -94,7 +94,13 @@ class PyRFContext(implicit sparkSession: SparkSession) extends RasterFunctions */ def cell_type(name: String): CellType = CellType.fromName(name) - def cell_types: Seq[String] = astraea.spark.rasterframes.functions.cellTypes() + /** + * Convenience list of valid cell type strings + * @return Java List of String, which py4j can interpret as a python `list` + */ + def cell_types = { + astraea.spark.rasterframes.functions.cellTypes().asJava + } /** DESERIALIZATION **/ @@ -117,51 +123,51 @@ class PyRFContext(implicit sparkSession: SparkSession) extends RasterFunctions def temporalKeyColumn(df: DataFrame): Column = df.asRF.temporalKeyColumn.orNull - def tile_to_int_array(col: Column): Column = tile_to_array[Int](col) + def tile_to_int_array(col: Column): Column = tile_to_array_int(col) - def tile_to_double_array(col: Column): Column = tile_to_array[Double](col) + def tile_to_double_array(col: Column): Column = tile_to_array_double(col) // All the scalar tile arithmetic functions - def local_add_scalar(col: Column, scalar: Double): Column = local_add_scalar[Double](col, scalar) + def local_add_scalar(col: Column, scalar: Double): Column = local_add[Double](col, scalar) - def local_add_scalar_int(col: Column, scalar: Int): Column = local_add_scalar[Int](col, scalar) + def local_add_scalar_int(col: Column, scalar: Int): Column = local_add[Int](col, scalar) - def local_subtract_scalar(col: Column, scalar: Double): Column = local_subtract_scalar[Double](col, scalar) + def local_subtract_scalar(col: Column, scalar: Double): Column = local_subtract[Double](col, scalar) - def local_subtract_scalar_int(col: Column, scalar: Int): Column = local_subtract_scalar[Int](col, scalar) + def local_subtract_scalar_int(col: Column, scalar: Int): Column = local_subtract[Int](col, scalar) - def local_divide_scalar(col: Column, scalar: Double): Column = local_divide_scalar[Double](col, scalar) + def local_divide_scalar(col: Column, scalar: Double): Column = local_divide[Double](col, scalar) - def local_divide_scalar_int(col: Column, scalar: Int): Column = local_divide_scalar[Int](col, scalar) + def local_divide_scalar_int(col: Column, scalar: Int): Column = local_divide[Int](col, scalar) - def local_multiply_scalar(col: Column, scalar: Double): Column = local_multiply_scalar[Double](col, scalar) + def local_multiply_scalar(col: Column, scalar: Double): Column = local_multiply[Double](col, scalar) - def local_multiply_scalar_int(col: Column, scalar: Int): Column = local_multiply_scalar[Int](col, scalar) + def local_multiply_scalar_int(col: Column, scalar: Int): Column = local_multiply[Int](col, scalar) - def local_less_scalar(col: Column, scalar: Double): Column = local_less_scalar[Double](col, scalar) + def local_less_scalar(col: Column, scalar: Double): Column = local_less[Double](col, scalar) - def local_less_scalar_int(col: Column, scalar: Int): Column = local_less_scalar[Int](col, scalar) + def local_less_scalar_int(col: Column, scalar: Int): Column = local_less[Int](col, scalar) - def local_less_equal_scalar(col: Column, scalar: Double): Column = 
local_less_equal_scalar[Double](col, scalar) + def local_less_equal_scalar(col: Column, scalar: Double): Column = local_less_equal[Double](col, scalar) - def local_less_equal_scalar_int(col: Column, scalar: Int): Column = local_less_equal_scalar[Int](col, scalar) + def local_less_equal_scalar_int(col: Column, scalar: Int): Column = local_less_equal[Int](col, scalar) - def local_greater_scalar(col: Column, scalar: Double): Column = local_greater_scalar[Double](col, scalar) + def local_greater_scalar(col: Column, scalar: Double): Column = local_greater[Double](col, scalar) - def local_greater_scalar_int(col: Column, scalar: Int): Column = local_greater_scalar[Int](col, scalar) + def local_greater_scalar_int(col: Column, scalar: Int): Column = local_greater[Int](col, scalar) - def local_greater_equal_scalar(col: Column, scalar: Double): Column = local_greater_equal_scalar[Double](col, scalar) + def local_greater_equal_scalar(col: Column, scalar: Double): Column = local_greater_equal[Double](col, scalar) - def local_greater_equal_scalar_int(col: Column, scalar: Int): Column = local_greater_equal_scalar[Int](col, scalar) + def local_greater_equal_scalar_int(col: Column, scalar: Int): Column = local_greater_equal[Int](col, scalar) - def local_equal_scalar(col: Column, scalar: Double): Column = local_equal_scalar[Double](col, scalar) + def local_equal_scalar(col: Column, scalar: Double): Column = local_equal[Double](col, scalar) - def local_equal_scalar_int(col: Column, scalar: Int): Column = local_equal_scalar[Int](col, scalar) + def local_equal_scalar_int(col: Column, scalar: Int): Column = local_equal[Int](col, scalar) - def local_unequal_scalar(col: Column, scalar: Double): Column = local_unequal_scalar[Double](col, scalar) + def local_unequal_scalar(col: Column, scalar: Double): Column = local_unequal[Double](col, scalar) - def local_unequal_scalar_int(col: Column, scalar: Int): Column = local_unequal_scalar[Int](col, scalar) + def local_unequal_scalar_int(col: Column, scalar: Int): Column = local_unequal[Int](col, scalar) // return toRaster, get just the tile, and make an array out of it def toIntRaster(df: DataFrame, colname: String, cols: Int, rows: Int): Array[Int] = { diff --git a/version.sbt b/version.sbt index a0bf63cbb..eb883072c 100644 --- a/version.sbt +++ b/version.sbt @@ -1 +1 @@ -version in ThisBuild := "0.8.0-RC3" +version in ThisBuild := "0.8.0-RC4"