diff --git a/.gitignore b/.gitignore index 1e2a1f6c8..84f6d75b1 100644 --- a/.gitignore +++ b/.gitignore @@ -16,12 +16,11 @@ target # Oozie build core/pig* -build -mem +core/hive-* core/build core/mem examples/oozietests mkdistro-*.out distro/downloads SecurityAuth.audit - +build diff --git a/client/pom.xml b/client/pom.xml index cffe29fe0..161eb3a4e 100644 --- a/client/pom.xml +++ b/client/pom.xml @@ -57,6 +57,16 @@ javax.persistence persistence-api provided + + + commons-lang + commons-lang + + + + + commons-lang + commons-lang diff --git a/client/src/main/java/org/apache/oozie/cli/OozieCLI.java b/client/src/main/java/org/apache/oozie/cli/OozieCLI.java index 8132e4c4a..84593ee64 100644 --- a/client/src/main/java/org/apache/oozie/cli/OozieCLI.java +++ b/client/src/main/java/org/apache/oozie/cli/OozieCLI.java @@ -1169,6 +1169,8 @@ private void validateCommand(CommandLine commandLine) throws OozieCLIException { List sources = new ArrayList(); sources.add(new StreamSource(Thread.currentThread().getContextClassLoader().getResourceAsStream( "oozie-workflow-0.1.xsd"))); + sources.add(new StreamSource(Thread.currentThread().getContextClassLoader().getResourceAsStream( + "hive-action-0.2.xsd"))); SchemaFactory factory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI); Schema schema = factory.newSchema(sources.toArray(new StreamSource[sources.size()])); Validator validator = schema.newValidator(); diff --git a/client/src/main/resources/hive-action-0.2.xsd b/client/src/main/resources/hive-action-0.2.xsd new file mode 100644 index 000000000..dde95b99f --- /dev/null +++ b/client/src/main/resources/hive-action-0.2.xsd @@ -0,0 +1,65 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/core/pom.xml b/core/pom.xml index 661fce131..ddc03895d 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -60,7 +60,18 @@ javax.persistence persistence-api - compile + provided + + + commons-lang + commons-lang + + + + + + 
commons-lang + commons-lang @@ -76,14 +87,12 @@ - - com.yahoo.hadoop + org.apache.hadoop hadoop-core provided - - com.yahoo.hadoop + org.apache.hadoop hadoop-test test @@ -92,6 +101,12 @@ com.yahoo.oozie oozie-sharelib provided + + + org.datanucleus + datanucleus-enhancer + + diff --git a/core/src/main/conf/oozie-site.xml b/core/src/main/conf/oozie-site.xml index c0544376a..5bd835830 100644 --- a/core/src/main/conf/oozie-site.xml +++ b/core/src/main/conf/oozie-site.xml @@ -16,6 +16,22 @@ + + + + + + + + + + mapred.reduce.tasks + -1 + The default number of reduce tasks per job. Typically set + to a prime close to the number of available hosts. Ignored when + mapred.job.tracker is "local". Hadoop set this to 1 by default, whereas hive uses -1 as its default value. + By setting this property to -1, Hive will automatically figure out what should be the number of reducers. + + + + + hive.exec.reducers.bytes.per.reducer + 1000000000 + size per reducer.The default is 1G, i.e if the input size is 10G, it will use 10 reducers. + + + + hive.exec.reducers.max + 999 + max number of reducers will be used. If the one + specified in the configuration parameter mapred.reduce.tasks is + negative, hive will use this one as the max number of reducers when + automatically determine number of reducers. + + + + hive.exec.scratchdir + /tmp/hive-${user.name} + Scratch space for Hive jobs + + + + hive.test.mode + false + whether hive is running in test mode. 
If yes, it turns on sampling and prefixes the output tablename + + + + hive.test.mode.prefix + test_ + if hive is running in test mode, prefixes the output table by this string + + + + + + + + + + + hive.test.mode.samplefreq + 32 + if hive is running in test mode and table is not bucketed, sampling frequency + + + + hive.test.mode.nosamplelist + + if hive is running in test mode, don't sample the above comma separated list of tables + + + + hive.metastore.local + true + controls whether to connect to remote metastore server or open a new metastore server in Hive Client JVM + + + + javax.jdo.option.ConnectionURL + jdbc:derby:;databaseName=metastore_db;create=true + JDBC connect string for a JDBC metastore + + + + javax.jdo.option.ConnectionDriverName + org.apache.derby.jdbc.EmbeddedDriver + Driver class name for a JDBC metastore + + + + javax.jdo.PersistenceManagerFactoryClass + org.datanucleus.jdo.JDOPersistenceManagerFactory + class implementing the jdo persistence + + + + javax.jdo.option.DetachAllOnCommit + true + detaches all objects from session so that they can be used after transaction is committed + + + + javax.jdo.option.NonTransactionalRead + true + reads outside of transactions + + + + javax.jdo.option.ConnectionUserName + APP + username to use against metastore database + + + + + + + + + + datanucleus.validateTables + false + validates existing schema against code. turn this on if you want to verify existing schema + + + + datanucleus.validateColumns + false + validates existing schema against code. turn this on if you want to verify existing schema + + + + datanucleus.validateConstraints + false + validates existing schema against code. turn this on if you want to verify existing schema + + + + datanucleus.storeManagerType + rdbms + metadata store type + + + + datanucleus.autoCreateSchema + true + creates necessary schema on a startup if one doesn't exist. 
set this to false, after creating it once + + + + datanucleus.autoStartMechanismMode + checked + throw exception if metadata tables are incorrect + + + + datancucleus.transactionIsolation + read-committed + + + + + datanuclues.cache.level2 + true + use a level 2 cache. turn this off if metadata is changed independently of hive metastore server + + + + datanuclues.cache.level2.type + SOFT + SOFT=soft reference based cache, WEAK=weak reference based cache. + + + + hive.metastore.warehouse.dir + /user/hive/warehouse + location of default database for the warehouse + + + + hive.metastore.connect.retries + 5 + Number of retries while opening a connection to metastore + + + + hive.metastore.rawstore.impl + org.apache.hadoop.hive.metastore.ObjectStore + Name of the class that implements org.apache.hadoop.hive.metastore.rawstore interface. This class is used to store and retrieval of raw metadata objects such as table, database + + + + hive.default.fileformat + TextFile + Default file format for CREATE TABLE statement. Options are TextFile and SequenceFile. Users can explicitly say CREATE TABLE ... STORED AS <TEXTFILE|SEQUENCEFILE> to override + + + + hive.fileformat.check + true + Whether to check file format or not when loading data files + + + + hive.map.aggr + true + Whether to use map-side aggregation in Hive Group By queries + + + + hive.groupby.skewindata + false + Whether there is skew in data to optimize group by queries + + + + hive.groupby.mapaggr.checkinterval + 100000 + Number of rows after which size of the grouping keys/aggregation classes is performed + + + + hive.mapred.local.mem + 0 + For local mode, memory of the mappers/reducers + + + + hive.map.aggr.hash.percentmemory + 0.5 + Portion of total memory to be used by map-side group aggregation hash table + + + + hive.map.aggr.hash.min.reduction + 0.5 + Hash aggregation will be turned off if the ratio between hash + table size and input rows is bigger than this number. 
Set to 1 to make sure + hash aggregation is never turned off. + + + + hive.optimize.cp + true + Whether to enable column pruner + + + + hive.optimize.ppd + true + Whether to enable predicate pushdown + + + + hive.optimize.pruner + true + Whether to enable the new partition pruner which depends on predicate pushdown. If this is disabled, + the old partition pruner which is based on AST will be enabled. + + + + hive.optimize.groupby + true + Whether to enable the bucketed group by from bucketed partitions/tables. + + + + hive.join.emit.interval + 1000 + How many rows in the right-most join operand Hive should buffer before emitting the join result. + + + + hive.join.cache.size + 25000 + How many rows in the joining tables (except the streaming table) should be cached in memory. + + + + hive.mapjoin.bucket.cache.size + 100 + How many values in each keys in the map-joined table should be cached in memory. + + + + hive.mapjoin.maxsize + 100000 + Maximum # of rows of the small table that can be handled by map-side join. If the size is reached and hive.task.progress is set, a fatal error counter is set and the job will be killed. + + + + hive.mapjoin.cache.numrows + 25000 + How many rows should be cached by jdbm for map join. + + + + hive.mapred.mode + nonstrict + The mode in which the hive operations are being performed. In strict mode, some risky queries are not allowed to run + + + + hive.exec.script.maxerrsize + 100000 + Maximum number of bytes a script is allowed to emit to standard error (per map-reduce task). This prevents runaway scripts from filling logs partitions to capacity + + + + hive.exec.script.allow.partial.consumption + false + When enabled, this option allows a user script to exit successfully without consuming all the data from the standard input. 
+ + + + + hive.script.operator.id.env.var + HIVE_SCRIPT_OPERATOR_ID + Name of the environment variable that holds the unique script operator ID in the user's transform function (the custom mapper/reducer that the user has specified in the query) + + + + + hive.exec.compress.output + false + This controls whether the final outputs of a query (to a local/hdfs file or a hive table) is compressed. The compression codec and other options are determined from hadoop config variables mapred.output.compress* + + + + hive.exec.compress.intermediate + false + This controls whether intermediate files produced by hive between multiple map-reduce jobs are compressed. The compression codec and other options are determined from hadoop config variables mapred.output.compress* + + + + hive.exec.parallel + false + Whether to execute jobs in parallel + + + + hive.hwi.war.file + lib/hive-hwi-0.5.0+20.war + This sets the path to the HWI war file, relative to ${HIVE_HOME}. + + + + hive.hwi.listen.host + 0.0.0.0 + This is the host address the Hive Web Interface will listen on + + + + hive.hwi.listen.port + 9999 + This is the port the Hive Web Interface will listen on + + + + hive.exec.pre.hooks + + Pre Execute Hook for Tests + + + + hive.merge.mapfiles + true + Merge small files at the end of a map-only job + + + + hive.merge.mapredfiles + false + Merge small files at the end of any job(map only or map-reduce) + + + + hive.heartbeat.interval + 1000 + Send a heartbeat after this interval - used by mapjoin and filter operators + + + + hive.merge.size.per.task + 256000000 + Size of merged files at the end of the job + + + + hive.script.auto.progress + false + Whether Hive Transform/Map/Reduce Clause should automatically send progress information to TaskTracker to avoid the task getting killed because of inactivity. Hive sends progress information when the script is outputting to stderr. 
This option removes the need of periodically producing stderr messages, but users should be cautious because this may prevent infinite loops in the scripts from being killed by TaskTracker. + + + + hive.script.serde + org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + The default serde for transmitting input data to and reading output data from the user scripts. + + + + hive.script.recordreader + org.apache.hadoop.hive.ql.exec.TextRecordReader + The default record reader for reading data from the user scripts. + + + + hive.script.recordwriter + org.apache.hadoop.hive.ql.exec.TextRecordWriter + The default record writer for writing data to the user scripts. + + + + hive.input.format + org.apache.hadoop.hive.ql.io.HiveInputFormat + The default input format, if it is not specified, the system assigns it. It is set to HiveInputFormat for hadoop versions 17, 18 and 19, whereas it is set to CombinedHiveInputFormat for hadoop 20. The user can always overwrite it - if there is a bug in CombinedHiveInputFormat, it can always be manually set to HiveInputFormat. + + + + hive.udtf.auto.progress + false + Whether Hive should automatically send progress information to TaskTracker when using UDTF's to prevent the task getting killed because of inactivity. Users should be cautious because this may prevent TaskTracker from killing tasks with infinite loops. + + + + hive.mapred.reduce.tasks.speculative.execution + true + Whether speculative execution for reducers should be turned on. + + + diff --git a/docs/src/site/twiki/DG_HiveActionExtension.twiki b/docs/src/site/twiki/DG_HiveActionExtension.twiki new file mode 100644 index 000000000..10b7e2ee2 --- /dev/null +++ b/docs/src/site/twiki/DG_HiveActionExtension.twiki @@ -0,0 +1,221 @@ + + +[[index][::Go back to Oozie Documentation Index::]] + +----- + +---+!! Oozie Hive Action Extension + +%TOC% + +#HiveAction +---++ Hive Action + +The =hive= action runs a Hive job. 
+ +The workflow job will wait until the Hive job completes before +continuing to the next action. + +To run the Hive job, you have to configure the =hive= action with the +=job-tracker=, =name-node= and Hive =script= elements as +well as the necessary parameters and configuration. + +A =hive= action can be configured to create or delete HDFS directories +before starting the Hive job. + +Hive configuration can be specified with a file, using the =job-xml= +element, and inline, using the =configuration= elements. + +Oozie EL expressions can be used in the inline configuration. Property +values specified in the =configuration= element override values specified +in the =job-xml= file. + +Note that Hadoop =mapred.job.tracker= and =fs.default.name= properties +must not be present in the inline configuration. + +As with Hadoop =map-reduce= jobs, it is possible to add files and +archives in order to make them available to the Hive job. Refer to the +[WorkflowFunctionalSpec#FilesAchives][Adding Files and Archives for the Job] +section for more information about this feature. + +Oozie Hive action supports Hive scripts with parameter variables, their +syntax is =${VARIABLES}=. + +*Syntax:* + + + + ... + + + [JOB-TRACKER] + [NAME-NODE] + + + ... + + ... + + [HIVE SETTINGS FILE] + + + [PROPERTY-NAME] + [PROPERTY-VALUE] + + ... + + + [PARAM-VALUE] + ... + [PARAM-VALUE] + [FILE-PATH] + ... + [FILE-PATH] + ... + + + + + ... + + + +The =prepare= element, if present, indicates a list of paths to delete +or create before starting the job. Specified paths must start with =hdfs://HOST:PORT=. + +The =job-xml= element, if present, specifies a file containing configuration +for the Hive job. + +The =configuration= element, if present, contains configuration +properties that are passed to the Hive job. + +The =script= element must contain the path of the Hive script to +execute. The Hive script can be templatized with variables of the form +=${VARIABLE}=. 
The values of these variables can then be specified +using the =params= element. + +The =params= element, if present, contains parameters to be passed to +the Hive script. + +All the above elements can be parameterized (templatized) using EL +expressions. + +*Example:* + + + + ... + + + foo:9001 + bar:9000 + + + + + + mapred.compress.map.output + true + + + oozie.hive.defaults + /usr/foo/hive-0.6-default.xml + + + + InputDir=/home/tucu/input-data + OutputDir=${jobOutput} + + + + + ... + + + +---+++ Hive Default and Site Configuration Files + +All the properties defined in the =job-xml= and inline in the =configuration= +element become the =hive-site.xml= that Hive will use. + +Hive (as of Hive 0.6) does not yet include a =hive-default.xml= file; it is the responsibility +of the user to provide one. When using Oozie Hive action, the =hive-default.xml= file must be +copied to HDFS and in the Hive action =configuration= section it must be set in the =oozie.hive.defaults= +property. If a relative path is given, the path will be resolved within the workflow application +directory. + +*NOTE:* When Hive starts bundling a =hive-default.xml= file within its JARs, Oozie will ignore +the =hive-default.xml= file specified in the Hive action configuration. + +If a =hive-site.xml= file is not specified (or available in Hive JARs), the Oozie Hive action will fail. + +---+++ Hive Action Logging + +Hive action logs are redirected to the Oozie Launcher map-reduce job task STDOUT/STDERR that runs Hive. + +From Oozie web-console, from the Hive action pop up using the 'Console URL' link, it is possible +to navigate to the Oozie Launcher map-reduce job task logs via the Hadoop job-tracker web-console. + +The logging level of the Hive action can be set in the Hive action configuration using the +property =oozie.hive.log.level=. The default value is =INFO=. + +---++ Appendix, Hive XML-Schema + +---+++ AE.A Appendix A, Hive XML-Schema + + + +. + +. + + + + + + + + + + + + + +. 
+ + + + + + + + + + + + + +. + + + + + + +. + + + +. + + + +. + + + +[[index][::Go back to Oozie Documentation Index::]] + + diff --git a/docs/src/site/twiki/index.twiki b/docs/src/site/twiki/index.twiki index 247d14f36..717a1d26f 100644 --- a/docs/src/site/twiki/index.twiki +++ b/docs/src/site/twiki/index.twiki @@ -46,6 +46,10 @@ Enough reading already? Follow the steps in [[DG_QuickStart][Oozie Quick Start]] * [[DG_CustomActionExecutor][Writing a Custom Action Executor]] * [[./apidocs/index.html][Oozie Javadocs]] +---+++ Actions Extension + + * [[DG_HiveActionExtension][Hive Action]] + ---++ Administrator Documentation * [[AG_Install][Oozie Install]] diff --git a/examples/pom.xml b/examples/pom.xml index 23c0eb036..9dc3d5ba7 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -39,6 +39,12 @@ + + junit + junit + test + + com.yahoo.oozie oozie-core @@ -65,15 +71,13 @@ - - com.yahoo.hadoop + org.apache.hadoop hadoop-core provided - - com.yahoo.hadoop + org.apache.hadoop hadoop-test test @@ -84,6 +88,11 @@ test + + org.apache.derby + derby + compile + diff --git a/examples/src/main/apps/hive/job.properties b/examples/src/main/apps/hive/job.properties new file mode 100644 index 000000000..fc2745946 --- /dev/null +++ b/examples/src/main/apps/hive/job.properties @@ -0,0 +1,5 @@ +oozie.wf.application.path=hdfs://localhost:9000/user/${user.name}/examples/apps/hive +jobTracker=localhost:9001 +nameNode=hdfs://localhost:9000 +queueName=default +oozie.libpath=/user/${user.name}/examples/apps/examples-lib diff --git a/examples/src/main/apps/hive/my-hive-default.xml b/examples/src/main/apps/hive/my-hive-default.xml new file mode 100644 index 000000000..b5b001cb6 --- /dev/null +++ b/examples/src/main/apps/hive/my-hive-default.xml @@ -0,0 +1,434 @@ + + + + + + + + + + + + + + mapred.reduce.tasks + -1 + The default number of reduce tasks per job. Typically set + to a prime close to the number of available hosts. Ignored when + mapred.job.tracker is "local". 
Hadoop set this to 1 by default, whereas hive uses -1 as its default value. + By setting this property to -1, Hive will automatically figure out what should be the number of reducers. + + + + + hive.exec.reducers.bytes.per.reducer + 1000000000 + size per reducer.The default is 1G, i.e if the input size is 10G, it will use 10 reducers. + + + + hive.exec.reducers.max + 999 + max number of reducers will be used. If the one + specified in the configuration parameter mapred.reduce.tasks is + negative, hive will use this one as the max number of reducers when + automatically determine number of reducers. + + + + hive.exec.scratchdir + /tmp/hive-${user.name} + Scratch space for Hive jobs + + + + hive.test.mode + false + whether hive is running in test mode. If yes, it turns on sampling and prefixes the output tablename + + + + hive.test.mode.prefix + test_ + if hive is running in test mode, prefixes the output table by this string + + + + + + + + + + + hive.test.mode.samplefreq + 32 + if hive is running in test mode and table is not bucketed, sampling frequency + + + + hive.test.mode.nosamplelist + + if hive is running in test mode, don't sample the above comma separated list of tables + + + + hive.metastore.local + true + controls whether to connect to remote metastore server or open a new metastore server in Hive Client JVM + + + + javax.jdo.option.ConnectionURL + jdbc:derby:;databaseName=metastore_db;create=true + JDBC connect string for a JDBC metastore + + + + javax.jdo.option.ConnectionDriverName + org.apache.derby.jdbc.EmbeddedDriver + Driver class name for a JDBC metastore + + + + javax.jdo.PersistenceManagerFactoryClass + org.datanucleus.jdo.JDOPersistenceManagerFactory + class implementing the jdo persistence + + + + javax.jdo.option.DetachAllOnCommit + true + detaches all objects from session so that they can be used after transaction is committed + + + + javax.jdo.option.NonTransactionalRead + true + reads outside of transactions + + + + 
javax.jdo.option.ConnectionUserName + APP + username to use against metastore database + + + + + + + + + + datanucleus.validateTables + false + validates existing schema against code. turn this on if you want to verify existing schema + + + + datanucleus.validateColumns + false + validates existing schema against code. turn this on if you want to verify existing schema + + + + datanucleus.validateConstraints + false + validates existing schema against code. turn this on if you want to verify existing schema + + + + datanucleus.storeManagerType + rdbms + metadata store type + + + + datanucleus.autoCreateSchema + true + creates necessary schema on a startup if one doesn't exist. set this to false, after creating it once + + + + datanucleus.autoStartMechanismMode + checked + throw exception if metadata tables are incorrect + + + + datancucleus.transactionIsolation + read-committed + + + + + datanuclues.cache.level2 + true + use a level 2 cache. turn this off if metadata is changed independently of hive metastore server + + + + datanuclues.cache.level2.type + SOFT + SOFT=soft reference based cache, WEAK=weak reference based cache. + + + + hive.metastore.warehouse.dir + /user/hive/warehouse + location of default database for the warehouse + + + + hive.metastore.connect.retries + 5 + Number of retries while opening a connection to metastore + + + + hive.metastore.rawstore.impl + org.apache.hadoop.hive.metastore.ObjectStore + Name of the class that implements org.apache.hadoop.hive.metastore.rawstore interface. This class is used to store and retrieval of raw metadata objects such as table, database + + + + hive.default.fileformat + TextFile + Default file format for CREATE TABLE statement. Options are TextFile and SequenceFile. Users can explicitly say CREATE TABLE ... 
STORED AS <TEXTFILE|SEQUENCEFILE> to override + + + + hive.fileformat.check + true + Whether to check file format or not when loading data files + + + + hive.map.aggr + true + Whether to use map-side aggregation in Hive Group By queries + + + + hive.groupby.skewindata + false + Whether there is skew in data to optimize group by queries + + + + hive.groupby.mapaggr.checkinterval + 100000 + Number of rows after which size of the grouping keys/aggregation classes is performed + + + + hive.mapred.local.mem + 0 + For local mode, memory of the mappers/reducers + + + + hive.map.aggr.hash.percentmemory + 0.5 + Portion of total memory to be used by map-side group aggregation hash table + + + + hive.map.aggr.hash.min.reduction + 0.5 + Hash aggregation will be turned off if the ratio between hash + table size and input rows is bigger than this number. Set to 1 to make sure + hash aggregation is never turned off. + + + + hive.optimize.cp + true + Whether to enable column pruner + + + + hive.optimize.ppd + true + Whether to enable predicate pushdown + + + + hive.optimize.pruner + true + Whether to enable the new partition pruner which depends on predicate pushdown. If this is disabled, + the old partition pruner which is based on AST will be enabled. + + + + hive.optimize.groupby + true + Whether to enable the bucketed group by from bucketed partitions/tables. + + + + hive.join.emit.interval + 1000 + How many rows in the right-most join operand Hive should buffer before emitting the join result. + + + + hive.join.cache.size + 25000 + How many rows in the joining tables (except the streaming table) should be cached in memory. + + + + hive.mapjoin.bucket.cache.size + 100 + How many values in each keys in the map-joined table should be cached in memory. + + + + hive.mapjoin.maxsize + 100000 + Maximum # of rows of the small table that can be handled by map-side join. If the size is reached and hive.task.progress is set, a fatal error counter is set and the job will be killed. 
+ + + + hive.mapjoin.cache.numrows + 25000 + How many rows should be cached by jdbm for map join. + + + + hive.mapred.mode + nonstrict + The mode in which the hive operations are being performed. In strict mode, some risky queries are not allowed to run + + + + hive.exec.script.maxerrsize + 100000 + Maximum number of bytes a script is allowed to emit to standard error (per map-reduce task). This prevents runaway scripts from filling logs partitions to capacity + + + + hive.exec.script.allow.partial.consumption + false + When enabled, this option allows a user script to exit successfully without consuming all the data from the standard input. + + + + + hive.script.operator.id.env.var + HIVE_SCRIPT_OPERATOR_ID + Name of the environment variable that holds the unique script operator ID in the user's transform function (the custom mapper/reducer that the user has specified in the query) + + + + + hive.exec.compress.output + false + This controls whether the final outputs of a query (to a local/hdfs file or a hive table) is compressed. The compression codec and other options are determined from hadoop config variables mapred.output.compress* + + + + hive.exec.compress.intermediate + false + This controls whether intermediate files produced by hive between multiple map-reduce jobs are compressed. The compression codec and other options are determined from hadoop config variables mapred.output.compress* + + + + hive.exec.parallel + false + Whether to execute jobs in parallel + + + + hive.hwi.war.file + lib/hive-hwi-0.5.0+20.war + This sets the path to the HWI war file, relative to ${HIVE_HOME}. 
+ + + + hive.hwi.listen.host + 0.0.0.0 + This is the host address the Hive Web Interface will listen on + + + + hive.hwi.listen.port + 9999 + This is the port the Hive Web Interface will listen on + + + + hive.exec.pre.hooks + + Pre Execute Hook for Tests + + + + hive.merge.mapfiles + true + Merge small files at the end of a map-only job + + + + hive.merge.mapredfiles + false + Merge small files at the end of any job(map only or map-reduce) + + + + hive.heartbeat.interval + 1000 + Send a heartbeat after this interval - used by mapjoin and filter operators + + + + hive.merge.size.per.task + 256000000 + Size of merged files at the end of the job + + + + hive.script.auto.progress + false + Whether Hive Transform/Map/Reduce Clause should automatically send progress information to TaskTracker to avoid the task getting killed because of inactivity. Hive sends progress information when the script is outputting to stderr. This option removes the need of periodically producing stderr messages, but users should be cautious because this may prevent infinite loops in the scripts from being killed by TaskTracker. + + + + hive.script.serde + org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + The default serde for transmitting input data to and reading output data from the user scripts. + + + + hive.script.recordreader + org.apache.hadoop.hive.ql.exec.TextRecordReader + The default record reader for reading data from the user scripts. + + + + hive.script.recordwriter + org.apache.hadoop.hive.ql.exec.TextRecordWriter + The default record writer for writing data to the user scripts. + + + + hive.input.format + org.apache.hadoop.hive.ql.io.HiveInputFormat + The default input format, if it is not specified, the system assigns it. It is set to HiveInputFormat for hadoop versions 17, 18 and 19, whereas it is set to CombinedHiveInputFormat for hadoop 20. The user can always overwrite it - if there is a bug in CombinedHiveInputFormat, it can always be manually set to HiveInputFormat. 
+ + + + hive.udtf.auto.progress + false + Whether Hive should automatically send progress information to TaskTracker when using UDTF's to prevent the task getting killed because of inactivity. Users should be cautious because this may prevent TaskTracker from killing tasks with infinite loops. + + + + hive.mapred.reduce.tasks.speculative.execution + true + Whether speculative execution for reducers should be turned on. + + + diff --git a/examples/src/main/apps/hive/script.q b/examples/src/main/apps/hive/script.q new file mode 100644 index 000000000..ae05aeee4 --- /dev/null +++ b/examples/src/main/apps/hive/script.q @@ -0,0 +1,2 @@ +CREATE EXTERNAL TABLE test (a INT) STORED AS TEXTFILE LOCATION '${INPUT}'; +INSERT OVERWRITE DIRECTORY '${OUTPUT}' SELECT * FROM test; diff --git a/examples/src/main/apps/hive/workflow.xml b/examples/src/main/apps/hive/workflow.xml new file mode 100644 index 000000000..26a1205a5 --- /dev/null +++ b/examples/src/main/apps/hive/workflow.xml @@ -0,0 +1,48 @@ + + + + + + + ${jobTracker} + ${nameNode} + + + + + + + mapred.job.queue.name + ${queueName} + + + oozie.hive.defaults + my-hive-default.xml + + + + INPUT=/user/${wf:user()}/examples/input-data/table + OUTPUT=/user/${wf:user()}/examples/output-data/hive + + + + + + + Hive failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + diff --git a/examples/src/test/java/org/apache/oozie/example/TestLocalOozieExample.java b/examples/src/test/java/org/apache/oozie/example/TestLocalOozieExample.java index ee0bb38a7..d629f747b 100644 --- a/examples/src/test/java/org/apache/oozie/example/TestLocalOozieExample.java +++ b/examples/src/test/java/org/apache/oozie/example/TestLocalOozieExample.java @@ -52,7 +52,7 @@ protected void setUp() throws Exception { conf.set("fs.default.name", getNameNodeUri()); injectKerberosInfo(conf); -// TODO restore this when getting rid of DoAs trick + // TODO restore this when getting rid of DoAs trick // if (System.getProperty("oozie.test.kerberos", 
"off").equals("on")) { // Configuration c = new Configuration(); diff --git a/pom.xml b/pom.xml index 919fcc70f..ace14aec5 100644 --- a/pom.xml +++ b/pom.xml @@ -78,23 +78,50 @@ true + - central - http://repo1.maven.org/maven2 + Codehaus repository + http://repository.codehaus.org/ false - Codehaus repository - http://repository.codehaus.org/ + yahoo.github + http://yahoo.github.com/maven/repository + + true + + + + + datanucleus + http://www.datanucleus.org/downloads/maven2 + Datanucleus + + false + + + + java.net repository + http://download.java.net/maven/2 + + false + + + + + cdh.repo + https://repository.cloudera.com/content/groups/cloudera-repos + Cloudera Repositories false - yahoo.github - http://yahoo.github.com/maven/repository + cdh.snapshots.repo + https://repository.cloudera.com/content/repositories/snapshots + Cloudera Snapshots Repository true @@ -217,6 +244,12 @@ 1.2.1 + + commons-lang + commons-lang + 2.4 + + commons-codec commons-codec @@ -224,10 +257,11 @@ - - com.yahoo.hadoop + org.apache.hadoop hadoop-core - 0.20.104.2 + + + 0.20.2-CDH3B4 org.apache.commons @@ -237,10 +271,11 @@ - - com.yahoo.hadoop + org.apache.hadoop hadoop-test - 0.20.104.2 + + + 0.20.2-CDH3B4 org.apache.commons @@ -262,10 +297,11 @@ - - com.yahoo.hadoop + org.apache.hadoop hadoop-streaming - 0.20.104.2 + + + 0.20.2-CDH3B4 org.apache.commons @@ -275,17 +311,12 @@ - - com.yahoo.hadoop + org.apache.pig pig - 0.7.0 + + + 0.8.0-CDH3B4 - - - com.yahoo.hadoop - hadoop-core - - org.apache.hadoop hadoop-core @@ -305,6 +336,18 @@ + + org.apache.thrift + thrift + 0.5.0-cdh + + + + org.apache.hadoop.hive + hive-cli + 0.7.0-CDH3B4 + + org.slf4j slf4j-log4j12 @@ -478,6 +521,13 @@ 2.0-beta-6 + + + org.apache.maven.doxia + doxia-module-confluence + 1.0-alpha-11 + + org.apache.maven.plugins @@ -540,6 +590,8 @@ 1.6 1.6 + + -proc:none @@ -547,7 +599,7 @@ maven-surefire-plugin ${oozie.test.forkMode} - -Xmx1024m + -Xmx1024m -da /tmp diff --git a/release-log.txt b/release-log.txt index 
fe2e273df..1b4ace6cb 100644 --- a/release-log.txt +++ b/release-log.txt @@ -1,5 +1,7 @@ -- Oozie 3.0.0 release +GH-0022 Add Hive Action +GH-0226 Standardize on groupId/artifactId for Hadoop/Pig/Oozie (using CDH artifacts) GH-0542 Refactor CoordSuspendXCommand & CoordResumeXCommand to TransitionXCommand based GH-0543 Refactor CoordRerunXCommand to TransitionXCommand based GH-0530 Update workflow rerun twiki. diff --git a/sharelib/pom.xml b/sharelib/pom.xml index b29a76ba2..25d005935 100644 --- a/sharelib/pom.xml +++ b/sharelib/pom.xml @@ -39,35 +39,77 @@ - - com.yahoo.hadoop + org.apache.hadoop hadoop-streaming compile - - com.yahoo.hadoop + org.apache.hadoop hadoop-core - - com.yahoo.hadoop + org.apache.pig pig compile - - com.yahoo.hadoop + org.apache.hadoop hadoop-core + + junit + junit + - jline - jline + org.apache.hadoop.hive + hive-cli compile + + + org.apache.hadoop.hive + hive-hwi + + + org.apache.hadoop.hive + hive-jdbc + + + org.apache.hadoop.hive + hive-anttasks + + + junit + junit + + + log4j + log4j + + + commons-codec + commons-codec + + + commons-logging + commons-logging + + + commons-logging + commons-logging-api + + + org.apache.hadoop + hadoop-core + + + org.apache.derby + derby + + diff --git a/src/main/assemblies/client.xml b/src/main/assemblies/client.xml index 75aad515d..6e7a8c497 100644 --- a/src/main/assemblies/client.xml +++ b/src/main/assemblies/client.xml @@ -53,6 +53,7 @@ compile javax.persistence:persistence-api + *:*:pom diff --git a/src/main/assemblies/sharelib.xml b/src/main/assemblies/sharelib.xml index e1a6440c3..86f5b7c85 100644 --- a/src/main/assemblies/sharelib.xml +++ b/src/main/assemblies/sharelib.xml @@ -33,6 +33,7 @@ false ${project.groupId}:oozie-sharelib + *:*:pom diff --git a/webapp/pom.xml b/webapp/pom.xml index 1ae21fc26..f7f10c503 100644 --- a/webapp/pom.xml +++ b/webapp/pom.xml @@ -134,7 +134,7 @@ - com.yahoo.hadoop + org.apache.hadoop hadoop-core compile