From dfcc5b465863752932713a69e52902ed6968722c Mon Sep 17 00:00:00 2001
From: Evan Samanas
Date: Thu, 12 Feb 2015 15:28:33 -0800
Subject: [PATCH] Add HDFS how-to

1. Shows basically how to read from HDFS
2. Shows what the HDFS URL looks like, including server name and port. Adds
a CDH5 compliant way of getting that info.
3. Tells you how to set which java implementation the HDFS client will use
---
 README.md    |  1 +
 load_hdfs.py | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)
 create mode 100644 load_hdfs.py

diff --git a/README.md b/README.md
index e1bfc9a..36184b4 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@ Data Ingress
 * [Load a JSON file into an SFrame](load_json.py)
 * [Load a collection XML files into an SFrame](sframe_xml_to_dict.py)
 * [Load an Avro file into an SFrame](load_avro.py)
+* [Load a file on HDFS into an SFrame](load_hdfs.py)
 
 Tabular Data Transformation
 -----------------------------
diff --git a/load_hdfs.py b/load_hdfs.py
new file mode 100644
index 0000000..a37bcd3
--- /dev/null
+++ b/load_hdfs.py
@@ -0,0 +1,40 @@
+import os
+import subprocess
+import graphlab as gl
+
+# Reading from HDFS into an SFrame is easy, as long as you know how to
+# construct your HDFS URL and your system has java installed in a relatively
+# standard way. This how-to is meant to help if one of those two things is
+# not true for you.
+
+#### Installation-specific variables ####
+# Change these variables for your HDFS setup
+
+hdfs_url_base = None
+# An example of what should be in this variable
+#hdfs_url_base = 'hdfs://my.server.com:8020'
+
+username = 'evan'
+
+filepath = 'test.txt'
+
+#### Construct your HDFS URL ####
+# If you don't know how to get the server and port to reach your HDFS
+# installation, here's a way to do it that works on CDH 5.
+if hdfs_url_base is None:
+    # universal_newlines=True makes check_output return text rather than
+    # bytes on Python 3, so the URL can be concatenated with str below.
+    hdfs_url_base = subprocess.check_output(
+        ['hdfs', 'getconf', '-confKey', 'fs.defaultFS'],
+        universal_newlines=True).rstrip()
+
+#### Specify a Java installation (OPTIONAL) ####
+# To set a specific java implementation to execute the HDFS commands, set this
+# environment variable BEFORE running any GraphLab Create commands.
+# '/foo/java' is a placeholder: replace it with your Java home, or remove
+# this line if your system's default Java already works.
+os.environ['GRAPHLAB_JAVA_HOME'] = '/foo/java'
+
+sf = gl.SFrame.read_csv(hdfs_url_base + '/user/' + username + '/' + filepath)
+
+print(sf)