scala - Bulk load to Phoenix using Spark

Tags: scala apache-spark hbase phoenix

I am trying to write some utility code to bulk load data via HFiles from a Spark RDD.

I was following the pattern of CSVBulkLoadTool from Phoenix. I managed to generate some HFiles and load them into HBase, but I can't see the rows using sqlline (they do show up with the hbase shell, for example). I would be grateful for any suggestions.

BulkPhoenixLoader.scala:

class BulkPhoenixLoader[A <: ImmutableBytesWritable : ClassTag, T <: KeyValue : ClassTag](rdd: RDD[(A, T)]) {

  def createConf(tableName: String, inConf: Option[Configuration] = None): Configuration = {
    val conf = inConf.map(HBaseConfiguration.create).getOrElse(HBaseConfiguration.create())
    val job: Job = Job.getInstance(conf, "Phoenix bulk load")

    job.setMapOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setMapOutputValueClass(classOf[KeyValue])

    // initialize credentials to possibly run in a secure env
    TableMapReduceUtil.initCredentials(job)

    val htable: HTable = new HTable(conf, tableName)

    // Auto configure partitioner and reducer according to the Main Data table
    HFileOutputFormat2.configureIncrementalLoad(job, htable)
    conf
  }

  def bulkSave(tableName: String, outputPath: String, conf: Option[Configuration]) = {
    val configuration: Configuration = createConf(tableName, conf)
    rdd.saveAsNewAPIHadoopFile(
      outputPath,
      classOf[ImmutableBytesWritable],
      classOf[KeyValue],
      classOf[HFileOutputFormat2],
      configuration)
  }

}

ExtendedProductRDDFunctions.scala:

class ExtendedProductRDDFunctions[A <: scala.Product](data: org.apache.spark.rdd.RDD[A])
  extends ProductRDDFunctions[A](data) with Serializable {

  def toHFile(tableName: String,
              columns: Seq[String],
              conf: Configuration = new Configuration,
              zkUrl: Option[String] = None): RDD[(ImmutableBytesWritable, KeyValue)] = {

    val config = ConfigurationUtil.getOutputConfiguration(tableName, columns, zkUrl, Some(conf))

    val tableBytes = Bytes.toBytes(tableName)
    val encodedColumns = ConfigurationUtil.encodeColumns(config)
    val jdbcUrl = zkUrl.map(getJdbcUrl).getOrElse(getJdbcUrl(config))

    val conn = DriverManager.getConnection(jdbcUrl)

    val query = QueryUtil.constructUpsertStatement(tableName,
      columns.toList.asJava,
      null)
    data.flatMap(x => mapRow(x, jdbcUrl, encodedColumns, tableBytes, query))
  }

  def mapRow(product: Product,
             jdbcUrl: String,
             encodedColumns: String,
             tableBytes: Array[Byte],
             query: String): List[(ImmutableBytesWritable, KeyValue)] = {

    val conn = DriverManager.getConnection(jdbcUrl)
    val preparedStatement = conn.prepareStatement(query)

    val columnsInfo = ConfigurationUtil.decodeColumns(encodedColumns)
    columnsInfo.zip(product.productIterator.toList).zipWithIndex.foreach(setInStatement(preparedStatement))
    preparedStatement.execute()

    val uncommittedDataIterator = PhoenixRuntime.getUncommittedDataIterator(conn, true)
    val hRows = uncommittedDataIterator.asScala.filter(kvPair =>
      Bytes.compareTo(tableBytes, kvPair.getFirst) == 0
    ).flatMap(kvPair => kvPair.getSecond.asScala.map(
      kv => {
        val byteArray = kv.getRowArray.slice(kv.getRowOffset, kv.getRowOffset + kv.getRowLength - 1) :+ 1.toByte
        (new ImmutableBytesWritable(byteArray, 0, kv.getRowLength), kv)
      }))

    conn.rollback()
    conn.close()
    hRows.toList
  }

  def setInStatement(statement: PreparedStatement): (((ColumnInfo, Any), Int)) => Unit = {
    case ((c, v), i) =>
      if (v != null) {
        // Both Java and Joda dates used to work in 4.2.3, but now they must be java.sql.Date
        val (finalObj, finalType) = v match {
          case dt: DateTime => (new Date(dt.getMillis), PDate.INSTANCE.getSqlType)
          case d: util.Date => (new Date(d.getTime), PDate.INSTANCE.getSqlType)
          case _ => (v, c.getSqlType)
        }
        statement.setObject(i + 1, finalObj, finalType)
      } else {
        statement.setNull(i + 1, c.getSqlType)
      }
  }

  private def getIndexTables(conn: Connection, qualifiedTableName: String): List[(String, String)] = {
    val table: PTable = PhoenixRuntime.getTable(conn, qualifiedTableName)
    val tables = table.getIndexes.asScala.map(x => x.getIndexType match {
      case IndexType.LOCAL => (x.getTableName.getString, MetaDataUtil.getLocalIndexTableName(qualifiedTableName))
      case _ => (x.getTableName.getString, x.getTableName.getString)
    }).toList
    tables
  }


}

I load the generated HFiles with the bulk-load utility shipped with HBase, as follows:
hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles path/to/hfile tableName
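
For completeness, the same flow can also be driven end to end from Scala instead of the CLI. This is only a sketch under assumed names: the table MY_TABLE with columns ID, COL1 and COL2, the ZooKeeper quorum zk-host:2181 and the staging directory /tmp/hfiles are all placeholders, not part of the original code.

import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.HTable
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles
import org.apache.spark.SparkContext

val sc = new SparkContext("local", "hfile-bulk-load")
val conf = HBaseConfiguration.create()

// A toy RDD of Products standing in for the real input data.
val productRdd = sc.parallelize(Seq((1L, "a", 1), (2L, "b", 2)))

// Generate the KeyValues and write them out as HFiles ...
val hfileRdd = new ExtendedProductRDDFunctions(productRdd)
  .toHFile("MY_TABLE", Seq("ID", "COL1", "COL2"), conf, Some("zk-host:2181"))
new BulkPhoenixLoader(hfileRdd).bulkSave("MY_TABLE", "/tmp/hfiles", Some(conf))

// ... then hand the staging directory to HBase, same as the CLI call above.
new LoadIncrementalHFiles(conf).doBulkLoad(new Path("/tmp/hfiles"), new HTable(conf, "MY_TABLE"))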

Best Answer

You can convert your CSV file to an RDD of Product and use the .saveToPhoenix method. This is generally how I load CSV data into Phoenix.

See: https://phoenix.apache.org/phoenix_spark.html
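
For reference, a minimal sketch of that approach, along the lines of the example in the phoenix-spark documentation. The target table OUTPUT_TABLE(ID, COL1, COL2), the input file data.csv and the ZooKeeper quorum zk-host:2181 are assumed placeholders.

import org.apache.spark.SparkContext
import org.apache.phoenix.spark._  // adds saveToPhoenix to RDDs of Product

val sc = new SparkContext("local", "csv-to-phoenix")

// Parse each CSV line into a tuple matching the target table's columns.
val rows = sc.textFile("data.csv").map { line =>
  val Array(id, col1, col2) = line.split(",")
  (id.toLong, col1, col2.toInt)
}

// saveToPhoenix runs UPSERTs through the Phoenix client, so the rows
// (and any secondary indexes) are immediately visible from sqlline.
rows.saveToPhoenix(
  "OUTPUT_TABLE",
  Seq("ID", "COL1", "COL2"),
  zkUrl = Some("zk-host:2181")
)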

The original question, scala - Bulk load to Phoenix using Spark, can be found on Stack Overflow: https://stackoverflow.com/questions/30685700/
