package icu.wuhufly.features

import icu.wuhufly.SparkHandler
import org.apache.spark.SparkContext
import org.apache.spark.sql.functions.{current_timestamp, lit}
import org.apache.spark.sql.types.{DoubleType, LongType, StructField, StructType, TimestampType}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.dom4j.{DocumentHelper, Element}

import java.sql.Timestamp
import java.util
import scala.collection.JavaConverters
import scala.util.Try

/**
 * Spark job that flattens the XML payload stored in `fact_machine_data.MachineRecordData`
 * into one numeric column per measurement and writes the result to
 * `fact_machine_learning_data` in the `hudi_gy_dwd` database.
 *
 * Each output row contains, in order: `machine_record_id`, `machine_id`,
 * `machine_record_state` (1.0 when the source state is '报警', otherwise 0.0), one double
 * column per entry of [[enCols]], `machine_record_date`, and four `dwd_*` audit columns.
 */
object Parse01 {

  /**
   * Job entry point. Obtains the Spark session from [[SparkHandler]], runs the pipeline and
   * stops the Spark context afterwards, whether or not the pipeline succeeded.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val handler: SparkHandler = SparkHandler.of()
    val spark: SparkSession = handler.getSpark()
    val sc: SparkContext = spark.sparkContext
    try {
      run(handler, spark)
    } finally {
      // Always release the context, even when the read, parse or write step throws.
      sc.stop()
    }
  }

  /**
   * Reads the source table, expands the XML payload into feature columns and writes the
   * resulting DataFrame through `handler.writeIntoHDFS`.
   *
   * @param handler project helper used for reading and writing the tables
   * @param spark   active Spark session
   */
  private def run(handler: SparkHandler, spark: SparkSession): Unit = {
    import spark.implicits._

    // The source labels and the output column names are paired by position.
    require(
      cnCols.length == enCols.length,
      s"cnCols (${cnCols.length}) and enCols (${enCols.length}) must have the same length"
    )

    spark.sql("use hudi_gy_dwd")

    // The three leading fields are read as doubles so they can share one row layout with the
    // parsed features; machine_record_id is cast back to a long further down.
    val df: Dataset[(Double, Double, Double, Array[Double], Timestamp)] = handler.readFromHDFS(
      "fact_machine_data", spark, "hudi_gy_dwd"
    )
      .selectExpr(
        "cast(MachineRecordID as double) as machine_record_id",
        "cast(MachineID as double) as machine_id",
        "cast(if(MachineRecordState='报警', 1, 0) as double) as machine_record_state",
        "MachineRecordData",
        "to_timestamp(MachineRecordDate, 'yyyy-MM-dd HH:mm:ss') as machine_record_date"
      )
      .as[(Double, Double, Double, String, Timestamp)]
      .repartition(24)
      .map(t => (t._1, t._2, t._3, parseRecordData(t._4), t._5))

    // Output layout: three leading doubles, one double per feature, then the record timestamp.
    val doubleColumns: Array[String] =
      Array("machine_record_id", "machine_id", "machine_record_state") ++ enCols
    val schema: StructType = StructType(
      doubleColumns.map(name => StructField(name, DoubleType)) :+
        StructField("machine_record_date", TimestampType)
    )

    // Build generic rows in the same order as the schema fields.
    val rows = df.rdd.map { case (recordId, machineId, state, features, recordDate) =>
      Row.fromSeq((Seq[Any](recordId, machineId, state) ++ features) :+ recordDate)
    }

    val resDF: DataFrame = spark.createDataFrame(rows, schema)
      // NOTE(review): the id has passed through a double, which is exact only for integers
      // up to 2^53 - confirm that real record ids stay below that.
      .withColumn("machine_record_id", $"machine_record_id".cast(LongType))
      .withColumn("dwd_insert_time", current_timestamp())
      .withColumn("dwd_insert_user", lit("user1"))
      .withColumn("dwd_modify_time", current_timestamp())
      .withColumn("dwd_modify_user", lit("user1"))

    handler.writeIntoHDFS("fact_machine_learning_data", resDF, "machine_record_id",
      "machine_record_date", "machine_id", "hudi_gy_dwd")
  }

  /**
   * Extracts the values of the [[cnCols]] measurements from one XML payload.
   *
   * The payload is treated as a sequence of sibling elements, so it is wrapped in a synthetic
   * `<r>` root before parsing. Each child element is keyed by the text of its first attribute
   * and carries its value as element text. Children without any attribute are ignored, and a
   * null payload is treated as empty.
   *
   * @param data XML fragment, may be null
   * @return one double per entry of [[cnCols]], in the same order; 0.0 where the measurement
   *         is missing or its text is not a valid double
   * @throws org.dom4j.DocumentException if the wrapped payload is not well-formed XML
   */
  private def parseRecordData(data: String): Array[Double] = {
    val payload: String = Option(data).getOrElse("")
    val root: Element = DocumentHelper.parseText("<r>%s</r>".format(payload)).getRootElement

    val valuesByName: Map[String, String] = JavaConverters.asScalaBuffer(root.elements())
      // attribute(0) can be null for an element without attributes; skip those elements.
      .flatMap(e => Option(e.attribute(0)).map(attr => (attr.getText, e.getText)))
      .toMap

    cnCols.map { name =>
      valuesByName.get(name).flatMap(text => Try(text.toDouble).toOption).getOrElse(0.0)
    }
  }

  /** Measurement labels looked up in the XML payload, in output order. */
  val cnCols: Array[String] = Array("主轴转速", "主轴倍率", "主轴负载", "进给倍率", "进给速度", "PMC程序号", "循环时间", "运行时间", "有效轴数", "总加工个数", "已使用内存", "未使用内存", "可用程序量", "注册程序量")

  /**
   * Output column names, paired by position with [[cnCols]].
   *
   * NOTE(review): positions 4 and 5 pair "进给倍率" with `machine_record_feed_speed` and
   * "进给速度" with `machine_record_feed_multiplerate`, which looks swapped - confirm against
   * the target table definition before changing it.
   */
  val enCols: Array[String] = Array("machine_record_mainshaft_speed", "machine_record_mainshaft_multiplerate", "machine_record_mainshaft_load", "machine_record_feed_speed", "machine_record_feed_multiplerate", "machine_record_pmc_code", "machine_record_circle_time", "machine_record_run_time", "machine_record_effective_shaft", "machine_record_amount_process", "machine_record_use_memory", "machine_record_free_memory", "machine_record_amount_use_code", "machine_record_amount_free_code")
}
