Commit: [SQL-DS-CACHE-36][POAE7-898] HCFS docs for OAP 1.1 (#37)

* [SQL-DS-CACHE-36][POAE7-898] HCFS docs for OAP 1.1
* address comment
* address comment
* Replace white/black with allow/deny
Showing 3 changed files with 84 additions and 18 deletions.
# HCFS User Guide

* [Prerequisites](#prerequisites)
* [Configurations](#configurations)

## Prerequisites

HCFS based Data Source Cache on Spark 3.0.0 requires a working Hadoop cluster with YARN and Spark. Running Spark on YARN requires a binary distribution of Spark that is built with YARN support. HCFS based Data Source Cache also requires Plasma and Redis; please follow the [OAP-Installation-Guide](OAP-Installation-Guide.md) to install them.

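Once both are installed, they must be running before Spark jobs can use the cache. Below is a minimal sketch of starting the two services on a node; the memory size and socket path are illustrative assumptions, not required values, so refer to the installation guide for the settings used in your deployment.

```bash
# Start the Plasma store; -m is shared memory in bytes (illustrative size),
# -s is the UNIX socket path clients connect to (illustrative path)
plasma-store-server -m 2000000000 -s /tmp/plasmaStore &

# Start Redis on its default port (6379); in production, point it at your redis.conf
redis-server &
```
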
## Configurations

### Spark Configurations

Before you run `$SPARK_HOME/bin/spark-shell`, you need to configure Spark for integration. Add or update the following configurations in the Spark configuration file `$SPARK_HOME/conf/spark-defaults.conf` on your working node.

```bash
spark.hadoop.fs.cachedFs.impl com.intel.oap.fs.hadoop.cachedfs.CachedFileSystem
# absolute path of the jar on your working node
spark.files /path/to/hcfs-sql-ds-cache-<version>.jar
# relative path to spark.files, just specify the jar name in the current dir
spark.executor.extraClassPath ./hcfs-sql-ds-cache-<version>.jar
# absolute path of the jar on your working node
spark.driver.extraClassPath /path/to/hcfs-sql-ds-cache-<version>.jar
```

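With these settings in place, data read through the `cachedFs://` scheme goes through the cached file system instead of plain HDFS. A hypothetical session is sketched below; the host, port, and dataset path are placeholders, not values mandated by the plugin.

```bash
# Launch the shell; it picks up spark-defaults.conf automatically
$SPARK_HOME/bin/spark-shell

# Inside the shell, address data via the cachedFs scheme rather than hdfs://
# (placeholder NameNode host/port and path):
# scala> val df = spark.read.parquet("cachedFs://localhost:9000/path/to/data")
# scala> df.count()
```
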
### Redis Configuration

Add the following configuration to `$SPARK_HOME/conf/spark-defaults.conf`.

```
spark.hadoop.fs.cachedFs.redis.host $HOST
spark.hadoop.fs.cachedFs.redis.port $PORT
```

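Before launching Spark, it can be worth confirming that the configured instance is reachable. A minimal sanity check, assuming `redis-cli` is available on the node, is shown below.

```bash
# Use the same $HOST and $PORT as in spark-defaults.conf;
# a healthy Redis instance replies with PONG
redis-cli -h $HOST -p $PORT ping
```
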
### Configuration for HCFS cache location policy

We provide three HCFS cache location policies; choose the one that best fits your workload.
* default
  With this policy, the file block locations consist of cached blocks plus HDFS blocks (when the cached blocks are incomplete).
* cache_over_hdfs
  This policy uses cached block locations only if all of the requested content is cached; otherwise it uses HDFS block locations.
* hdfs_only
  This policy ignores cached blocks when finding file block locations.

Add the following configuration to `$SPARK_HOME/conf/spark-defaults.conf`, setting the value to one of `default`, `cache_over_hdfs`, or `hdfs_only`.

```
spark.hadoop.fs.cachedFs.blockLocation.policy default
```

### Configuration for HCFS cache path pattern

We provide HCFS cache path patterns to determine whether a path will be cached.
* allowlist
  Paths matching the pattern will be cached. An empty regexp results in matching everything.
  e.g. `cachedFs://localhost:9000/dir/`
* denylist
  Paths matching the pattern will not be cached. An empty regexp results in nothing being denied.
  e.g. `io_data|io_control`

Add the following configuration to `$SPARK_HOME/conf/spark-defaults.conf`.

```
spark.hadoop.fs.cachedFs.allowList.regexp $PATTERN
spark.hadoop.fs.cachedFs.denylist.regexp $PATTERN
```

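Filling in the two examples from the list above gives a concrete sketch; the host, port, and directory are placeholders taken from those examples, not required values.

```
spark.hadoop.fs.cachedFs.allowList.regexp cachedFs://localhost:9000/dir/
spark.hadoop.fs.cachedFs.denylist.regexp io_data|io_control
```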