
As Hive does not provide any way to get the row count of all tables in a single statement, people generally have to write a custom script to get the result. Here we will use Spark to get the row count of all tables in a particular Hive DB. We will send the Hive DB name as a command-line argument in the spark-submit statement.
Spark Code:
package com.tjman.exp1
import org.apache.spark.sql.SparkSession
object HiveTableRecordCount {

  /**
   * Prints the row count of every table in a Hive database.
   *
   * Builds one consolidated `UNION ALL` query (one `SELECT COUNT(*)` per
   * table) so Hive/Spark is hit with a single statement instead of one
   * query per table.
   *
   * @param args args(0) is the Hive database name, passed from spark-submit.
   */
  def main(args: Array[String]): Unit = {
    // Fail fast with a usage message instead of an ArrayIndexOutOfBoundsException.
    require(args.nonEmpty, "Usage: HiveTableRecordCount <hive db name>")
    val dbName = args(0)

    val spark = SparkSession.builder()
      .appName("HiveTableRecordCount")
      .enableHiveSupport()
      .getOrCreate()

    try {
      spark.sql("use " + dbName)

      // `show tables` returns (database, tableName, isTemporary); column 1 is the name.
      val tableNames = spark.sql("show tables").collect().map(_.getString(1))

      if (tableNames.isEmpty) {
        // Guard: spark.sql("") on an empty database would throw a parse error.
        println(s"No tables found in database '$dbName'")
      } else {
        // Consolidated count query: one SELECT per table, joined with UNION ALL.
        val queryBuilder = tableNames
          .map(t => s"""select '$t' TABLE_NAME,count(*) COUNT from $t""")
          .mkString("\nunion all \n")
        //print(queryBuilder)
        spark.sql(queryBuilder).show
      }
    } finally {
      // Always release the Spark context, even if a query fails.
      spark.stop()
    }
  }
}
Spark Submit:
spark-submit --master yarn --class com.tjman.exp1.HiveTableRecordCount HiveTableRecordCount.jar <hive db name>
Result:
19/04/13 01:57:16 INFO YarnScheduler: Removed TaskSet 23.0, whose tasks have all completed, from pool
19/04/13 01:57:16 INFO DAGScheduler: ResultStage 23 (show at HiveTableRecordCount.scala:23) finished in 0.081 s
19/04/13 01:57:16 INFO DAGScheduler: Job 2 finished: show at HiveTableRecordCount.scala:23, took 0.085210 s
+--------------------+-----+
| TABLE_NAME|COUNT|
+--------------------+-----+
|adminlogindata_im...| 30|
|adminlogindata_pa...| 28|
|adminlogindata_pa...| 0|
|adminlogindata_pa...| 28|
| datatbl| 1111|
| datatbl_orc| 1111|
| datatblnew| 1111|
+--------------------+-----+
19/04/13 01:57:16 INFO SparkUI: Stopped Spark web UI at http://host1.tunetotech.com:10056
19/04/13 01:57:17 INFO YarnClientSchedulerBackend: Interrupting monitor thread