// Copyright 2025 by Carnegie Mellon University
// See license information in LICENSE.txt

package org.cert.netsa.mothra.tools

import java.lang.management.ManagementFactory
import java.util.concurrent.{Executors, LinkedBlockingQueue, ThreadPoolExecutor, TimeUnit}
import scala.collection.immutable.Queue
import scala.collection.mutable.{Map => MutMap, Set}
import scala.util.{Failure, Success, Try}
import scala.util.control.NonFatal
import scala.util.matching.Regex

import com.typesafe.scalalogging.StrictLogging
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{LocatedFileStatus, RemoteIterator}
import org.apache.hadoop.fs.{Path => HPath}
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.cert.netsa.io.ipfix.{
  ArrayRecord, BasicList, DataTypes, ExportSubTemplateList, ExportSubTemplateMultiList, InfoElement,
  InfoModel, Record, Session, SubTemplateList, SubTemplateMultiList, Template
}
import org.cert.netsa.mothra.packer.{PackerThreadFactory, Reader, Version, Writer}

import resource.managed // see http://jsuereth.com/scala-arm/index.html

/** Object to implement the FileSanitizer application.
  *
  * Typical Usage in a Spark environment:
  *
  * `spark-submit --class org.cert.netsa.mothra.tools.FileSanitizerMain mothra-tools.jar
  * <f1>[,<f2>[,<f3>...]] <s1> [<s2> <s3> ...]`
  *
  * where:
  *
  * f1..fn: Names of InfoElements to be removed from the files
  *
  * s1..sn: Directories to process, as Hadoop URIs
  *
  * FileSanitizer removes Information Element fields from the data files in a Mothra repository. In
  * addition, when multiple files share the same name except for the UUID, FileSanitizer combines
  * those files together.
  *
  * The IE fields to be removed must be specified in a single argument, as a comma-separated list of
  * names, such as `sourceTransportPort,destinationTransportPort`.
  *
  * Each remaining argument is a single directory to process.
  *
  * FileSanitizer runs as a batch process, not as a daemon.
  *
  * FileSanitizer makes a single recursive scan of the source directories <s1>, <s2>, ... for files
  * whose names match the pattern "YYYYMMDD.HH." or "YYYYMMDD.HH-PTddH." (It looks for files
  * matching the regular expression `^\d{8}\.\d{2}(?:-PT\d\d?H)?\.`) Files whose names match that
  * pattern are processed by FileSanitizer to remove the named Information Elements. All files where
  * the regular expression matched the same string are joined into a single file, similar to the
  * FileJoiner. Finally, the original files are removed.
  *
  * There is always a single thread that recursively scans the directories. The number of threads
  * that sanitizes and joins the files may be set by specifying the
  * `mothra.filesanitizer.maxThreads` Java property. If not specified, the default is 6.
  *
  * FileSanitizer may be run so that either it spawns a thread for every directory that contains
  * files to process or it spawns a thread for each set of files in a directory that have the same
  * prefix. The behavior is controlled whether the `mothra.filesanitizer.spawnThread` Java property
  * is set to `by-prefix` or `by-directory`. The default is `by-directory`. (For backwards
  * compatibility, `by-hour` is an alias for `by-prefix`.)
  *
  * By default, FileSanitizer does not compress the files it writes. (NOTE: It should support
  * writing the output using the same compression as the input.) To specify the compression codec
  * that it should use, specify the `mothra.filesanitizer.compression` Java property. Values
  * typically supported by Hadoop include `bzip2`, `gzip`, `lz4`, `lzo`, `lzop`, `snappy`, and
  * `default`. The empty string indicates no compression.
  *
  * FileSanitizer joins the files sharing the same prefix into a single file by default. The
  * `mothra.filesanitizer.maximumSize` Java property may be used to limit the maximum file size. The
  * size is for the compressed file if compression is active. The value is approximate since it is
  * only checked after the data appears on disk which occurs in large blocks because of buffering by
  * the Java stream code and the compression algorithm.
  */
object FileSanitizerMain extends App with StrictLogging {

  /** Prints the usage message to standard output and terminates the JVM.
    *
    * @param full
    *   when `true` the long description is printed after the synopsis and the exit status is 0
    *   (help was requested); when `false` only the synopsis is printed and the exit status is 1
    *   (the command line was invalid).
    */
  def usage(full: Boolean = false): Unit = {
    print("""
Usage: spark-submit --class org.cert.netsa.mothra.tools.FileSanitizerMain mothra-tools.jar <f1>[,<f2>[,<f3>...]] <s1> [<s2> <s3> ...]

f1..fn:         Names of InfoElements to be removed from the files
s1..sn:         Directories to process, as Hadoop URIs
""")
    if (full) {
      // NOTE: this is an s-interpolated string, so backslashes must be doubled
      // to appear literally in the output.
      print(s"""
FileSanitizer removes Information Element fields from the hourly files in a
Mothra repository.  In addition, when multiple files share the same name
except for the UUID, FileSanitizer combines those files together.

The IE fields to be removed must be specified in a single argument, as a
comma-separated list of names, such as
'sourceTransportPort,destinationTransportPort'.

Each remaining argument is a single directory to process.

FileSanitizer runs as a batch process, not as a daemon.

FileSanitizer makes a single recursive scan of the source directories
<s1>, <s2>, ... for files whose names match the pattern "YYYYMMDD.HH." or
"YYYYMMDD.HH-PTddH." (It looks for files matching the regular expression
`^\\d{8}\\.\\d{2}(?:-PT\\d\\d?H)?\\.`) Files whose names match that pattern are
processed by FileSanitizer to remove the named Information Elements.  All
files where the regular expression matched the same string are joined into a
single file, similar to the FileJoiner.  Finally, the original files are
removed.

There is always a single thread that recursively scans the directories.
The number of threads that sanitizes and joins the files may be set by
specifying the `mothra.filesanitizer.maxThreads` Java property.  If not
specified, the default is $DEFAULT_MAX_THREADS.

FileSanitizer may be run so that either it spawns a thread for every
directory that contains files to process or it spawns a thread for each
set of files in a directory that have the same prefix.  The behavior is
controlled whether the `mothra.filesanitizer.spawnThread` Java property is
set to `by-prefix` or `by-directory`.  The default is `$DEFAULT_SPAWN_THREAD`.
(For backwards compatibility, `by-hour` is an alias for `by-prefix`.)

By default, FileSanitizer does not compress the files it writes.
(NOTE: It should support writing the output using the same compression as
the input.)  To specify the compression codec that it should use, specify
the `mothra.filesanitizer.compression` Java property.  Values typically
supported by Hadoop include `bzip2`, `gzip`, `lz4`, `lzo`, `lzop`,
`snappy`, and `default`.  The empty string indicates no compression.

FileSanitizer joins files sharing the same prefix into a single file
by default.  The `mothra.filesanitizer.maximumSize` Java property may be
used to limit the maximum file size.  The size is for the compressed file
if compression is active.  The value is approximate since it is only
checked after the data appears on disk which occurs in large blocks
because of buffering by the Java stream code and the compression algorithm.
""")
    }
    // explicit help request => success; otherwise the caller found a bad
    // command line => failure
    System.exit(if (full) 0 else 1)
  }

  /** Prints the program name and version on standard output, then exits the JVM with status 0. */
  def version(): Unit = {
    val banner = s"FileSanitizer ${Version.get()}"
    println(banner)
    sys.exit(0)
  }

  //  <<<<<  DEFAULT VALUES FOR PROPERTY SETTINGS  >>>>>  //

  /** Name of the compression codec used for output files when the
    * `mothra.filesanitizer.compression` Java property is not set.
    *
    * The empty string means "do not compress". Codec names typically supported by Hadoop include
    * `bzip2`, `gzip`, `lz4`, `lzo`, `lzop`, `snappy`, and `default`.
    */
  val DEFAULT_COMPRESSION: String = ""

  /** Number of sanitizing threads used when the `mothra.filesanitizer.maxThreads` Java property is
    * not set. The directory scan is not counted; it always runs in its own thread.
    */
  val DEFAULT_MAX_THREADS: Int = 6

  /** Thread-spawning policy used when the `mothra.filesanitizer.spawnThread` Java property is not
    * set.
    */
  val DEFAULT_SPAWN_THREAD: String = "by-directory"

  //  <<<<<  PROCESS THE COMMAND LINE ARGUMENTS  >>>>>  //

  // Split the command line into option switches (anything starting with '-')
  // and positional arguments.  `startsWith` is used rather than
  // `substring(0, 1)` so that an empty-string argument is treated as a
  // positional argument instead of raising StringIndexOutOfBoundsException.
  val (switches, positionalArgs) = args.partition(_.startsWith("-"))

  // Act on each switch.  Every branch terminates the JVM: version() and
  // usage(true) exit with status 0, usage() exits with status 1.
  switches.foreach {
    case "-V" | "--version" => version()
    case "-h" | "--help"    => usage(true)
    case unknown =>
      println(s"Unknown argument '$unknown'")
      usage()
  }

  // Need one argument naming the IEs and at least one directory.
  if (positionalArgs.length < 2) {
    logger.error(s"Must specify at least two arguments: IE[,IE...] DIR [DIR...]")
    usage()
  }

  logger.info(
    "\n============================= FileSanitizer is starting =============================\n"
  )
  logger.info(s"This is FileSanitizer ${Version.get()}")

  /** The Hadoop configuration */
  implicit val hadoopConf: Configuration = new Configuration()

  /** The information model */
  implicit val infoModel: InfoModel = InfoModel.getCERTStandardInfoModel()

  /** The Information Elements to strip from every record, taken from the first positional
    * argument, which is a comma-separated list of IE names. Whitespace around each name and empty
    * names are ignored. A name that is not present in `infoModel` is logged and aborts start-up
    * with a RuntimeException.
    */
  val toRemove = Set.empty[InfoElement] ++
    (for {
      ieName <- positionalArgs(0).split(",").map(_.trim);
      if ieName.nonEmpty;
      ie <- infoModel
        .get(ieName)
        .orElse {
          logger.error(s"Unknown IE name '$ieName'")
          throw new RuntimeException(s"Unknown IE name '$ieName'")
        }
    } yield ie)

  // The remaining argument(s) is/are the directory(s) to scan.  This list is
  // used as a work stack by the scanning loop and is also read by the
  // task-count logging thread, hence @volatile.
  @volatile private var dirList = positionalArgs.tail.toList.map(new HPath(_))

  //  <<<<<  DEFINE SOME HELPER CLASSES  >>>>>  //

  /** A [[Writer]] for `basename` files in `dir` that removes the Information Elements listed in
    * `toRemove` from every record (including records nested in sub-template lists) before handing
    * the record to the export stream. The output compression and maximum size are taken from
    * `compressCodec` and `maximumSize`.
    */
  private class SanitizingWriter(dir: HPath, basename: String)
      extends Writer(dir, basename, compressCodec, maximumSize)(infoModel, hadoopConf) {
    // helpers that live in the companion object
    import SanitizingWriter.{dummyTemplate, RecordNeeded, Transcoder}

    /** Cache of Transcoders keyed by source Template. Each Transcoder holds the destination
      * template and the field-position mapping from the source template to it.
      */
    private object TranscoderMap {
      private var tmplMap = Map.empty[Template, Transcoder]

      /** Returns the cached Transcoder for `tmpl`. On a cache miss a Transcoder is built from
        * `tmpl` and `rec`, stored in the cache, and its destination template is registered with
        * the export session under the ID that `tmpl` has in the input session `sess`.
        */
      def get(tmpl: Template, rec: Option[Record], sess: Session): Transcoder =
        tmplMap.getOrElse(
          tmpl, {
            val created = Transcoder(tmpl, rec)
            tmplMap = tmplMap.updated(tmpl, created)
            exportSession.getOrAdd(created.template, sess.getId(tmpl))
            created
          }
        )
    }

    /** Builds a copy of `rec` that contains only the fields named in `transcoder.mapping`,
      * recursively sanitizing any SubTemplateList or SubTemplateMultiList values. `sess` is the
      * input session of `rec`. Returns None when the destination template has no fields, i.e.,
      * when every field of `rec` was removed.
      */
    private def sanitizeRecord(
      rec: Record,
      sess: Session,
      transcoder: Transcoder
    ): Option[Record] = {
      if (transcoder.template.size == 0) {
        None
      } else {
        val source = rec.template
        val target = ArrayRecord(transcoder.template)
        transcoder.mapping.foreach { case (srcPos, dstPos) =>
          target.update(
            dstPos,
            source(srcPos).dataTypeId match {
              case DataTypes.BasicList =>
                // FIXME: Handle the pathological case of ST(M)L in a BL
                BasicList(rec.apply(srcPos).asInstanceOf[BasicList], true)
              case DataTypes.SubTemplateList =>
                sanitizeSTL(rec.apply(srcPos).asInstanceOf[SubTemplateList], sess)
              case DataTypes.SubTemplateMultiList =>
                sanitizeSTML(rec.apply(srcPos).asInstanceOf[SubTemplateMultiList], sess)
              case _ => rec.apply(srcPos)
            }
          )
        }
        Some(target)
      }
    }

    /** Helper for sanitizeRecord(): returns a sanitized copy of the SubTemplateList `stl`. */
    private def sanitizeSTL(stl: SubTemplateList, sess: Session): SubTemplateList = {
      // an empty list that uses the dummyTemplate; stands in for a list whose
      // template cannot be (or need not be) represented in the output
      def placeholder: SubTemplateList =
        SubTemplateList(dummyTemplate, stl.semantics).asInstanceOf[ExportSubTemplateList]

      try {
        val sourceTemplate = stl.template
        val firstRecord = if (stl.size > 0) Option(stl(0)) else None
        val transcoder = TranscoderMap.get(sourceTemplate, firstRecord, sess)
        if (transcoder.template.size == 0) {
          // every IE of the template was removed
          placeholder
        } else {
          val sanitized = SubTemplateList(transcoder.template, stl.semantics)
            .asInstanceOf[ExportSubTemplateList]
          stl.iterator.foreach { entry =>
            sanitizeRecord(entry, sess, transcoder).foreach { clean =>
              sanitized.append(clean)
            }
          }
          sanitized
        }
      } catch {
        // The list's template contains a basicList but the list has no
        // records, so the IE carried by that basicList cannot be examined to
        // decide whether the basicList should be kept.  Punt and emit a list
        // that uses the dummyTemplate.
        case _: RecordNeeded => placeholder
      }
    }

    /** Helper for sanitizeRecord(): returns a sanitized copy of the SubTemplateMultiList `stml`. */
    private def sanitizeSTML(stml: SubTemplateMultiList, sess: Session): SubTemplateMultiList = {
      // the list being built for export
      val sanitized = SubTemplateMultiList(stml.semantics)
        .asInstanceOf[ExportSubTemplateMultiList]

      // Copies the entries at positions [from, until), all of which use
      // Template `t`.  Nothing is copied when no field of `t` survives.
      def copyRun(t: Template, from: Int, until: Int): Unit = {
        val transcoder = TranscoderMap.get(t, Option(stml(from)), sess)
        if (transcoder.template.size > 0) {
          (from until until).foreach { pos =>
            // the destination template is non-empty here, so sanitizeRecord
            // always yields a record
            sanitizeRecord(stml(pos), sess, transcoder).foreach { clean =>
              sanitized.append(clean)
            }
          }
        }
      }

      // walk the entries, handling each maximal run of consecutive entries
      // that share a template with a single call to copyRun()
      var runStart = 0
      while (runStart < stml.size) {
        val runTemplate = stml(runStart).template
        var runEnd = runStart + 1
        while (runEnd < stml.size && stml(runEnd).template == runTemplate) {
          runEnd += 1
        }
        copyRun(runTemplate, runStart, runEnd)
        runStart = runEnd
      }
      sanitized
    }

    /** Sanitizes `record` and writes the result to the export stream. Nothing is written when
      * sanitizing removes every field of the record.
      */
    override def add(record: Record): Unit = {
      val session = record.message.get.session
      // find or build the Transcoder for the record's template
      val transcoder = TranscoderMap.get(record.template, Option(record), session)
      sanitizeRecord(record, session, transcoder).foreach { clean =>
        exportStream.add(clean)
      }
    }

  }

  /** Factory for SanitizingWriter plus the helper types it relies on. */
  private object SanitizingWriter {
    def apply(dir: HPath, basename: String): SanitizingWriter = new SanitizingWriter(dir, basename)

    /** Placeholder Template (a single one-octet `paddingOctets` field) given to an outgoing
      * SubTemplateList when every IE of the incoming list's template has been removed.
      */
    private val dummyTemplate = Template
      .newTemplate(Seq[(InfoElement, Int)]((infoModel("paddingOctets"), 1)), "dummy_tmpl", None)

    /** Raised while building a Transcoder when the source Template has a BasicList field and no
      * Record is available from which to learn which IE that BasicList carries.
      */
    private case class RecordNeeded() extends RuntimeException

    /** The result of sanitizing one source Template: `template` is the destination Template and
      * `mapping` pairs each retained field's position in the source Template with its position in
      * the destination Template. The source Template itself is not stored here; it is the key of
      * the map that holds the Transcoders.
      */
    private class Transcoder(val template: Template, val mapping: List[(Int, Int)])

    private object Transcoder {

      /** Builds the Transcoder for source Template `tmpl`. A field is dropped when its IE is in
        * `toRemove`, or when it is a BasicList and the IE carried by that BasicList in `rec` is
        * in `toRemove`.
        *
        * Throws [[RecordNeeded]] if `tmpl` has a BasicList field that is not itself being removed
        * and `rec` is None.
        */
      def apply(tmpl: Template, rec: Option[Record]): Transcoder = {
        val keptFields = Queue.newBuilder[(InfoElement, Int)]
        val positions = List.newBuilder[(Int, Int)]
        // next free position in the destination template
        var dstPos = 0
        for (((ie, len), srcPos) <- tmpl.iterator.zipWithIndex) {
          val keep = !toRemove.contains(ie) && {
            if (ie.dataTypeId != DataTypes.BasicList) {
              true
            } else {
              // a BasicList survives only if the IE it carries also survives
              val bl = rec.getOrElse(throw new RecordNeeded()).apply(srcPos).asInstanceOf[BasicList]
              !toRemove.contains(bl.infoElement)
            }
          }
          if (keep) {
            keptFields += ((ie, len))
            positions += ((srcPos, dstPos))
            dstPos += 1
          }
        }
        val destination =
          Template.newTemplate(keptFields.result(), tmpl.name.orNull, tmpl.description)
        new Transcoder(destination, positions.result())
      }
    }
  }

  /** Sanitizes and joins the files specified in `files` that all exist in `dir` and all of whose
    * names begin with the same `basename` which is of the form "YYYYMMDD.HH." or
    * "YYYYMMDD.HH-PTddH."
    *
    * Every record of every input file is passed through a SanitizingWriter. When `maximumSize` is
    * set, the current output file is closed and a new one is started each time the writer reports
    * that it reached the maximum size. On success the new files are given the permission reported
    * by the first writer's `originalPermission` (if any) and the input files are deleted. On
    * failure the output files created so far are deleted and the input files are left in place.
    * Non-fatal errors are logged, not thrown.
    *
    * @param dir
    *   directory holding the input files; the output is written here as well
    * @param basename
    *   common file-name prefix of the input files
    * @param files
    *   the input files
    */
  private def sanitizeFilesBasename(dir: HPath, basename: String, files: Set[HPath]): Unit = {
    // file currently being written to; null when no file is open
    var writer: SanitizingWriter = null

    // list of newly created files
    var newPaths = List.empty[HPath]

    // list of files that were successfully processed
    var removeList = List.empty[HPath]

    logger.trace(s"Sanitizing ${files.size} '$basename*' files in $dir/")
    val t0 = System.currentTimeMillis()

    Try {
      writer = SanitizingWriter(dir, basename)
      val originalPerm = writer.originalPermission
      // process all input files
      for (f <- files) {
        // process all records in the input
        for (reader <- managed(Reader(f, codecFactory))) {
          for (record <- reader) {
            // the size limit is consulted only when one was configured
            if (maximumSize.nonEmpty && writer.reachedMaxSize) {
              logger.trace(s"Closing file '${writer.getName}'")
              writer.close()
              newPaths = writer.exportFile +: newPaths
              writer = null
              logger.trace(s"Creating additional writer for '$basename*' files in $dir")
              writer = SanitizingWriter(dir, basename)
            }
            writer.add(record)
          }
          removeList = f +: removeList
        }
      }
      writer.close()
      newPaths = writer.exportFile +: newPaths
      writer = null
      // restore the original permission bits on the new files
      for (
        perm <- originalPerm;
        f <- newPaths
      ) {
        fileSystem.setPermission(f, perm)
      }
      // remove the inputs; a failure here is only a warning since the
      // sanitized output is already complete
      for (f <- removeList) {
        Try {
          fileSystem.delete(f, false)
        } match {
          case Failure(e) => logger.warn(s"Failed to remove old file '$f': ${e.toString}")
          case _          =>
        }
      }
    } match {
      case Success(_) => logger.debug(
          f"Finished sanitizing ${files.size} '$basename*' files into ${newPaths.size} files in $dir/ in ${(System.currentTimeMillis() - t0).toDouble / 1000.0}%.3f seconds"
        )
      case Failure(e) =>
        logger.error(s"Failed to sanitize ${files.size} '$basename*' files in $dir/: $e")
        logger.debug(s"Failed to sanitize ${files.size} '$basename*' files in $dir/", e)
        for (w <- Option(writer)) {
          // Release the output stream before deleting the partial file so
          // that the open handle is not leaked.  The close may itself fail
          // (it may be what failed above); that is not a new error.
          Try {
            w.close()
          } match {
            case Failure(ex) => logger
                .debug(s"Failed to close partial file '${w.exportFile.getName}' in $dir/", ex)
            case _ =>
          }
          newPaths = w.exportFile +: newPaths
        }
        // discard the incomplete output; the inputs are still in place
        for (f <- newPaths) {
          Try {
            fileSystem.delete(f, false)
          } match {
            case Success(_) => ()
            case Failure(ex) => logger
                .error(s"Failed to remove new file '${f.getName}' in $dir/", ex)
          }
        }
    }
  }

  /** Thread-pool task for the `by-prefix` policy: sanitizes and joins one group of files. Every
    * entry of `files` is expected to live in `dir` and to have a name starting with `basename`.
    * An entry is added to `signalQueue` once the group has been handled.
    */
  private case class BasenameFilesJob(dir: HPath, basename: String, files: Set[HPath])
      extends Runnable {
    def run(): Unit = {
      sanitizeFilesBasename(dir, basename, files)
      // tell the main thread that a task has finished
      val _ = signalQueue.add(0)
    }
  }

  /** Thread-pool task for the `by-directory` policy. `files` maps each basename found in `dir` to
    * the files that share it; the groups are sanitized and joined one after another on the thread
    * running this task. An entry is added to `signalQueue` once all groups have been handled.
    */
  private case class DirectoryJob(dir: HPath, files: MutMap[String, Set[HPath]]) extends Runnable {
    def run(): Unit = {
      // handle each basename group of this directory in turn
      files.foreach { case (basename, group) =>
        sanitizeFilesBasename(dir, basename, group)
      }
      // tell the main thread that a task has finished
      val _ = signalQueue.add(0)
    }
  }

  // /////  Constants & Values Determined from Properties  /////

  /** A Compression Codec Factory */
  private val codecFactory = new CompressionCodecFactory(hadoopConf)

  /** The compression codec used for files written to HDFS, or None to write uncompressed files.
    * The codec name is read from the "mothra.filesanitizer.compression" property;
    * DEFAULT_COMPRESSION is used when the property is not set. The empty string selects no
    * compression.
    *
    * If the name is unknown to Hadoop or a compressor for it cannot be created, the problem is
    * logged and the result is None (no compression).
    */
  val compressCodec = {
    val compressName = sys.props.getOrElse("mothra.filesanitizer.compression", DEFAULT_COMPRESSION)
    if (compressName == "") {
      None
    } else {
      Try {
        // getCodecByName() returns null for an unknown name; report that
        // explicitly instead of letting it surface as a NullPointerException
        val codec = Option(codecFactory.getCodecByName(compressName)).getOrElse {
          throw new IllegalArgumentException(
            s"no compression codec is registered under the name '$compressName'"
          )
        }
        // Make sure we can create a compressor, not using it here.
        codec.createCompressor()
        codec
      } match {
        case Success(ok) => Option(ok)
        case Failure(e) =>
          logger.error(s"Unable to initialize compressor '$compressName' $e")
          logger.debug(s"Unable to initialize compressor '$compressName'", e)
          logger.warn("Using no compression for IPFIX files")
          None
      }
    }
  }
  // logger.trace(s"compressCodec is ${compressCodec}")

  /** The maximum number of filesanitizer threads to start. It defaults to the value
    * `DEFAULT_MAX_THREADS`.
    *
    * This run-time behavior may be modified by setting the mothra.filesanitizer.maxThreads
    * property. Start-up fails with a NumberFormatException if the property is not an integer and
    * with an IllegalArgumentException if the value is less than 1.
    */
  val maxThreads = sys
    .props
    .get("mothra.filesanitizer.maxThreads")
    .map { text =>
      try {
        text.trim.toInt
      } catch {
        // rethrow with a message that names the offending property
        case _: NumberFormatException => throw new NumberFormatException(
            s"mothra.filesanitizer.maxThreads must be an integer, but is '$text'"
          )
      }
    }
    .getOrElse(DEFAULT_MAX_THREADS)
  require(
    maxThreads >= 1,
    s"mothra.filesanitizer.maxThreads must be at least 1, but is $maxThreads"
  )

  /** The (approximate) maximum size file to create. The default is no maximum. When a file's size
    * exceeds this value, the file is closed and a new file is started. Typically a file's size will
    * not exceed this value by more than the maximum size of an IPFIX message, 64k.
    *
    * The value is read from the `mothra.filesanitizer.maximumSize` Java property. Start-up fails
    * with a NumberFormatException if the property is set to something other than an integer.
    */
  val maximumSize = sys
    .props
    .get("mothra.filesanitizer.maximumSize")
    .map { text =>
      try {
        text.trim.toLong
      } catch {
        // rethrow with a message that names the offending property
        case _: NumberFormatException => throw new NumberFormatException(
            s"mothra.filesanitizer.maximumSize must be an integer, but is '$text'"
          )
      }
    }

  /** Policy that decides what a single file-sanitizing task covers:
    *
    * `by-directory`: one task for every directory that contains files to be sanitized, or
    *
    * `by-prefix`: one task for every unique basename prefix (that is, the file name without the
    * UUID) within a single directory. `by-hour` is an alias for `by-prefix`.
    *
    * The value comes from the `mothra.filesanitizer.spawnThread` Java property, falling back to
    * `DEFAULT_SPAWN_THREAD` when the property is not set.
    */
  val spawnThread = sys.props.getOrElse("mothra.filesanitizer.spawnThread", DEFAULT_SPAWN_THREAD)

  /** Mapping from `spawnThread` value to `threadPerDirectory`. */
  val spawnThreadMap = Map("by-directory" -> true, "by-prefix" -> false, "by-hour" -> false)

  /** True when one task is queued per directory, false when one task is queued per basename
    * prefix. Start-up fails with an Exception listing the accepted names when `spawnThread` is not
    * a key of `spawnThreadMap`.
    */
  private val threadPerDirectory: Boolean = spawnThreadMap.getOrElse(
    spawnThread, {
      val complaint = spawnThreadMap
        .keys
        .mkString("mothra.filesanitizer.spawnThread must be one of: '", "', '", "'")
      throw new Exception(complaint)
    }
  )

  // /////  FileSanitizer procedural code begins here  /////

  // Every source directory must resolve to the file system of the first one,
  // since a single FileSystem object is used for all listing, permission and
  // delete operations.
  val fileSystem = dirList.head.getFileSystem(hadoopConf)
  if (!dirList.tail.forall(_.getFileSystem(hadoopConf) == fileSystem)) {
    val complaint = "source directories use different file systems"
    logger.error(complaint)
    throw new Exception(complaint)
  }

  // report the effective settings
  logger.info("FileSanitizer settings::")
  logger.info(s"Number of top-level directories to scan: ${dirList.size}")
  logger.info(s"Maximum number of file joining threads: $maxThreads")
  logger.info(s"Policy for starting threads: $spawnThread")
  logger.info(s"Approximate maximum output file size: ${maximumSize.getOrElse("unlimited")}")
  logger.info(s"Output file compression: ${compressCodec.getOrElse("none")}")
  locally {
    val jvmArguments = ManagementFactory
      .getRuntimeMXBean
      .getInputArguments
      .toArray
      .mkString(", ")
    logger.info(s"JVM Parameters: $jvmArguments")
  }
  locally {
    val ieNames = toRemove.map(_.name).mkString(", ")
    logger.info(s"Elements to remove: $ieNames")
  }

  /** Object used by sub-threads to signal to the main thread that they have completed. */
  private val signalQueue = new LinkedBlockingQueue[Int]()

  /** Fixed-size pool of `maxThreads` threads that runs the sanitizing tasks; tasks beyond that
    * number wait in an unbounded queue.
    */
  private val pool: ThreadPoolExecutor = new ThreadPoolExecutor(
    maxThreads, maxThreads, 0L, TimeUnit.SECONDS, new LinkedBlockingQueue[Runnable](),
    new PackerThreadFactory("FileSanitizerThread-")
  )

  /** How often to print log messages regarding the number of tasks, in seconds. */
  val logTaskCountInterval = 5

  // Log the task counts every `logTaskCountInterval` seconds.  A plain
  // Runnable is scheduled (there is no need to allocate a Thread that is never
  // started), and errors are contained because scheduleAtFixedRate()
  // suppresses all later runs once a run throws.
  private val logTaskCountThread = Executors
    .newScheduledThreadPool(1, new PackerThreadFactory("LogTaskCounts-"))
  logTaskCountThread.scheduleAtFixedRate(
    new Runnable {
      override def run(): Unit = {
        try {
          val active = pool.getActiveCount()
          val completed = pool.getCompletedTaskCount()
          val total = pool.getTaskCount()
          logger.info(
            s"Directories to scan: ${dirList.size}, Total tasks: $total, Completed tasks: $completed, Active tasks: $active, Queued tasks: ${total - active - completed}"
          )
        } catch {
          case NonFatal(e) => logger.debug(s"Unable to log task counts: ${e.toString}")
        }
      }
    },
    logTaskCountInterval,
    logTaskCountInterval,
    TimeUnit.SECONDS
  )

  /** Matches the names of repository data files. Group 1 captures the leading "YYYYMMDD.HH." or
    * "YYYYMMDD.HH-PTddH." part of the name, which is the key used to group files for joining.
    */
  private val repoFileRegex: Regex = """\A(\d{8}\.\d{2}(?:-PT\d\d?H)?\.).*\Z""".r

  locally {
    val count = dirList.size
    val noun = if (1 == count) "directory" else "directories"
    logger.info(s"Starting recursive scan of $count $noun")
  }

  // Recursively process all directories.  `dirList` is a stack of directories
  // still to be listed: sub-directories found during a listing are pushed onto
  // it and the loop ends when it is empty.
  while (dirList.nonEmpty) {
    val dir = dirList.head
    dirList = dirList.tail

    logger.trace(s"Scanning directory '$dir/'")
    // basename prefix -> files in `dir` whose names start with that prefix
    val fileMap: MutMap[String, Set[HPath]] = MutMap.empty
    val iter =
      try {
        fileSystem.listLocatedStatus(dir)
      } catch {
        case NonFatal(e) =>
          // return an empty iterator
          logger.warn(s"Unable to get status of '$dir/': ${e.getMessage}")
          new RemoteIterator[LocatedFileStatus]() {
            def hasNext: Boolean = false
            def next(): LocatedFileStatus = throw new NoSuchElementException()
          }
      }

    // Errors reading a single entry are ignored, but the listing of this
    // directory is abandoned when hasNext itself fails or when too many
    // entries in a row cannot be read.  Retrying unconditionally would spin
    // forever on a listing that keeps failing.
    val maxConsecutiveEntryFailures = 10
    var consecutiveFailures = 0
    var scanning = true
    while (scanning) {
      Try(iter.hasNext) match {
        case Failure(e) =>
          logger.warn(s"Unable to continue listing '$dir/': ${e.toString}")
          scanning = false
        case Success(hasMore) =>
          if (!hasMore) {
            // finished with this directory
            scanning = false
          } else {
            Try {
              // found an entry
              val entry = iter.next()
              if (entry.isDirectory) {
                dirList = entry.getPath +: dirList
              } else if (entry.isFile) {
                // its a file, check if it matches the regex
                for (m <- repoFileRegex.findFirstMatchIn(entry.getPath.getName)) {
                  fileMap.getOrElseUpdate(m.group(1), Set.empty) += entry.getPath
                }
              }
            } match {
              case Success(_) => consecutiveFailures = 0
              case Failure(e) =>
                // ignore errors stat-ing files
                logger.debug(s"Unable to read directory entry: ${e.toString}")
                consecutiveFailures += 1
                if (consecutiveFailures >= maxConsecutiveEntryFailures) {
                  logger.warn(
                    s"Abandoning listing of '$dir/' after $consecutiveFailures consecutive errors"
                  )
                  scanning = false
                }
            }
          }
      }
    }

    // Queue work only when the directory holds matching files; a task for an
    // empty directory would do nothing.
    if (fileMap.nonEmpty) {
      if (threadPerDirectory) {
        // Create one task for all files in the directory
        pool.execute(DirectoryJob(dir, fileMap))
      } else {
        // Create one task for each unique basename in this directory
        for ((basename, set) <- fileMap) {
          pool.execute(BasenameFilesJob(dir, basename, set))
        }
      }
    }
  }

  // The directory scan is over; what remains is to let the sanitizing tasks
  // run to completion.
  logger.info("Completed recursive directory scan")
  locally {
    val submitted = pool.getTaskCount()
    val outstanding = submitted - pool.getCompletedTaskCount()
    logger.info(s"Waiting for $outstanding of $submitted tasks to complete...")
  }

  // No more tasks will be submitted: shut the pool down, which still lets the
  // running and queued tasks finish.
  pool.shutdown()

  // Drop the signals of tasks that have already completed, then block on the
  // queue (at most 5 seconds at a time) until the pool reports that it has
  // terminated.
  signalQueue.clear()
  while (!pool.isTerminated()) {
    signalQueue.poll(5, TimeUnit.SECONDS)
  }
  logger.debug("All tasks have completed")

  // stop the periodic task-count logging
  logTaskCountThread.shutdown()
  logTaskCountThread.awaitTermination(1, TimeUnit.SECONDS)

  logger.info("FileSanitizer is done")

}

// @LICENSE_FOOTER@
//
// Mothra 1.7
//
// Copyright 2025 Carnegie Mellon University.
//
// NO WARRANTY. THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE MATERIAL IS
// FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON UNIVERSITY MAKES NO WARRANTIES OF ANY KIND,
// EITHER EXPRESSED OR IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY OF FITNESS
// FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS OBTAINED FROM USE OF THE MATERIAL.
// CARNEGIE MELLON UNIVERSITY DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM FROM
// PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
//
// Licensed under a GNU GPL 2.0-style license, please see LICENSE.txt or contact
// permission@sei.cmu.edu for full terms.
//
// [DISTRIBUTION STATEMENT A] This material has been approved for public release and unlimited
// distribution.  Please see Copyright notice for non-US Government use and distribution.
//
// This Software includes and/or makes use of Third-Party Software each subject to its own license.
//
// DM24-1649
