Reading HBase data with Spark for distributed computation


The hbaseTest example that ships with Spark is Scala only; there is no Java version. I ported the Scala version to Java and added some computation over the data it reads.

Purpose of the program: query HBase for the users that match the given conditions, then count how many users there are at each level.

The code is as follows; the comments explain the details:

package com.sdyc.ndspark.sys;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.Serializable;
import java.util.List;

/**
 * <pre>
 * spark hbase test
 *
 * Created with IntelliJ IDEA.
 * User: zhangdonghao
 * Date: 14-1-26
 * Time: 9:24 AM
 * To change this template use File | Settings | File Templates.
 * </pre>
 *
 * @author zhangdonghao
 */
public class HbaseTest implements Serializable {

    public Log log = LogFactory.getLog(HbaseTest.class);

    /**
     * Encode the Scan as a string; this method is copied from
     * org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.
     *
     * @param scan the scan to encode
     * @return the Base64-encoded scan
     * @throws IOException
     */
    static String convertScanToString(Scan scan) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DataOutputStream dos = new DataOutputStream(out);
        scan.write(dos);
        return Base64.encodeBytes(out.toByteArray());
    }

    public void start() {
        // Initialize the SparkContext. The HBase jar must be listed in the jars
        // parameter, otherwise an "unread block data" exception is thrown.
        JavaSparkContext sc = new JavaSparkContext("spark://nowledgedata-n3:7077", "hbaseTest",
                "/home/hadoop/software/spark-0.8.1",
                new String[]{"target/ndspark.jar", "target/dependency/hbase-0.94.6.jar"});

        // Create the Configuration with HBaseConfiguration.create();
        // the hadoop and hbase configuration files must be on the project classpath.
        Configuration conf = HBaseConfiguration.create();

        // Set the scan conditions; only the user's level is returned here.
        Scan scan = new Scan();
        scan.setStartRow(Bytes.toBytes("195861-1035177490"));
        scan.setStopRow(Bytes.toBytes("195861-1072173147"));
        scan.addFamily(Bytes.toBytes("info"));
        scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("levelCode"));

        try {
            // Name of the HBase table to read.
            String tableName = "usertable";
            conf.set(TableInputFormat.INPUT_TABLE, tableName);
            conf.set(TableInputFormat.SCAN, convertScanToString(scan));

            // Get the HBase query results as an RDD of Result.
            JavaPairRDD<ImmutableBytesWritable, Result> hBaseRDD = sc.newAPIHadoopRDD(conf,
                    TableInputFormat.class, ImmutableBytesWritable.class,
                    Result.class);

            // Extract the user's level from each Result and emit (level, 1),
            // so that each row is counted once.
            JavaPairRDD<Integer, Integer> levels = hBaseRDD.map(
                    new PairFunction<Tuple2<ImmutableBytesWritable, Result>, Integer, Integer>() {
                        @Override
                        public Tuple2<Integer, Integer> call(
                                Tuple2<ImmutableBytesWritable, Result> immutableBytesWritableResultTuple2)
                                throws Exception {
                            byte[] o = immutableBytesWritableResultTuple2._2().getValue(
                                    Bytes.toBytes("info"), Bytes.toBytes("levelCode"));
                            if (o != null) {
                                return new Tuple2<Integer, Integer>(Bytes.toInt(o), 1);
                            }
                            return null;
                        }
                    });

            // Sum the counts per level.
            JavaPairRDD<Integer, Integer> counts = levels.reduceByKey(new Function2<Integer, Integer, Integer>() {
                public Integer call(Integer i1, Integer i2) {
                    return i1 + i2;
                }
            });

            // Print the final result.
            List<Tuple2<Integer, Integer>> output = counts.collect();
            for (Tuple2 tuple : output) {
                System.out.println(tuple._1 + ": " + tuple._2);
            }

        } catch (Exception e) {
            log.warn(e);
        }

    }

    /**
     * If the Spark computation is not written inside main, the enclosing class must
     * implement the Serializable interface; otherwise a
     * "Task not serializable: java.io.NotSerializableException" is thrown.
     */
    public static void main(String[] args) throws InterruptedException {

        new HbaseTest().start();

        System.exit(0);
    }
}
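The code above targets Spark 0.8.1 and HBase 0.94, where Scan still implements Writable and the Java API returns a JavaPairRDD from map(PairFunction). Both APIs changed later, so the following is only a minimal sketch of the same per-level count under the assumption of Spark 1.6/2.x (mapToPair, lambda-friendly function interfaces, spark-submit) and HBase 1.x (where TableMapReduceUtil.convertScanToString is public); the row-key range, table name and column names are carried over from the example and are assumptions about your data:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class HbaseLevelCount {
    public static void main(String[] args) throws Exception {
        // Master URL and jar distribution are handled by spark-submit instead of the
        // JavaSparkContext constructor used in the original example.
        SparkConf sparkConf = new SparkConf().setAppName("hbaseLevelCount");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        // Same scan conditions as the original example (assumed row-key range and column).
        Scan scan = new Scan();
        scan.setStartRow(Bytes.toBytes("195861-1035177490"));
        scan.setStopRow(Bytes.toBytes("195861-1072173147"));
        scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("levelCode"));

        Configuration conf = HBaseConfiguration.create();
        conf.set(TableInputFormat.INPUT_TABLE, "usertable");
        // In HBase 1.x the helper is public, so no hand-rolled convertScanToString is needed.
        conf.set(TableInputFormat.SCAN, TableMapReduceUtil.convertScanToString(scan));

        JavaPairRDD<ImmutableBytesWritable, Result> hBaseRDD = sc.newAPIHadoopRDD(
                conf, TableInputFormat.class, ImmutableBytesWritable.class, Result.class);

        // mapToPair replaces map(PairFunction) in Spark 1.0+; rows without a levelCode
        // are filtered out instead of returning null from the pair function.
        JavaPairRDD<Integer, Integer> counts = hBaseRDD
                .filter(t -> t._2().getValue(Bytes.toBytes("info"), Bytes.toBytes("levelCode")) != null)
                .mapToPair(t -> new Tuple2<>(
                        Bytes.toInt(t._2().getValue(Bytes.toBytes("info"), Bytes.toBytes("levelCode"))), 1))
                .reduceByKey((a, b) -> a + b);

        for (Tuple2<Integer, Integer> tuple : counts.collect()) {
            System.out.println(tuple._1() + ": " + tuple._2());
        }
        sc.stop();
    }
}

If you go this route, submit the job with spark-submit and make sure the HBase client jars are on the driver and executor classpath (for example via --jars), rather than passing a jar list to the JavaSparkContext constructor as the 0.8.1 code does.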

The output of the run is as follows:

0: 28528
11: 708
4: 28656
2: 36315
6: 23848
8: 19802
10: 6913
9: 15988
3: 31950
1: 38872
7: 21600
5: 27190
12: 17