Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use scala-csv #27

Merged
merged 7 commits into from
Jan 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
# Changelog

## 3.8.0
* Replace kantan.csv with scala-csv

## 3.7.2
* Parse barcodes from read IDs in demultiplexed mode

## 3.7.1
* Adjust handling of command-line arguments in the demultiplexed FASTQ file case

## 3.7.0
* Support for processing demultiplexed FASTQ files

Expand Down
28 changes: 14 additions & 14 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -13,22 +13,21 @@ lazy val versions = new {
val acyclic = "0.2.1"
val betterFiles = "3.9.2"
val betterMonadicFor = "0.3.1"
val catsEffect3 = "3.5.1"
val catsEffect3 = "3.5.2"
val cats = "2.10.0"
val commonsIo = "2.13.0"
val commonsText = "1.10.0"
val commonsIo = "2.15.1"
val commonsText = "1.11.0"
val commonsMath3 = "3.6.1"
val fastutil = "8.5.12"
val fs2 = "3.8.0"
val kantanCodecs = "0.5.3"
val kantanCsv = "0.7.0"
val fs2 = "3.9.3"
val log4s = "1.10.0"
val logback = "1.2.11"
val logback = "1.2.13"
val munit = "0.7.29"
val munitCatsEffect3 = "1.0.7"
val samTools = "3.0.5"
val scalaCheck = "1.17.0"
val scalaTest = "3.2.16"
val scalaCsv = "1.3.10"
val scalaTest = "3.2.17"
val scalaTestPlusScalaCheck = "3.2.2.0"
val scopt = "4.1.0"
val slf4j = "1.7.36"
Expand All @@ -45,12 +44,11 @@ lazy val libraries = new {
val fastutil = "it.unimi.dsi" % "fastutil" % versions.fastutil
val fs2Core = "co.fs2" %% "fs2-core" % versions.fs2
val fs2Io = "co.fs2" %% "fs2-io" % versions.fs2
val kantanCodecs = "com.nrinaudo" %% "kantan.codecs" % versions.kantanCodecs
val kantanCsv = "com.nrinaudo" %% "kantan.csv" % versions.kantanCsv
val log4s = "org.log4s" %% "log4s" % versions.log4s
val logbackCore = "ch.qos.logback" % "logback-core" % versions.logback
val logbackClassic = "ch.qos.logback" % "logback-classic" % versions.logback
val samtools = "com.github.samtools" % "htsjdk" % versions.samTools
val scalaCsv = "com.github.tototoshi" %% "scala-csv" % versions.scalaCsv
val scopt = "com.github.scopt" %% "scopt" % versions.scopt
val slf4j = "org.slf4j" % "slf4j-api" % versions.slf4j

Expand All @@ -71,12 +69,11 @@ lazy val dependencies =
libraries.commonsIo,
libraries.commonsMath3,
libraries.fastutil,
libraries.kantanCodecs,
libraries.kantanCsv,
libraries.log4s,
libraries.logbackCore % Runtime,
libraries.logbackClassic % Runtime,
libraries.samtools,
libraries.scalaCsv,
libraries.scopt,
libraries.slf4j,
libraries.betterFiles % Test,
Expand Down Expand Up @@ -107,8 +104,11 @@ lazy val headerSettings = List(
lazy val assemblySettings = List(
assembly / assemblyJarName := "../bin/poolq3.jar",
assembly / assemblyMergeStrategy := {
case "logback.xml" => MergeStrategy.first
case "logback-test.xml" => MergeStrategy.discard
case "logback.xml" => MergeStrategy.first
case "logback-test.xml" => MergeStrategy.discard
case PathList("module-info.class") => MergeStrategy.discard
case PathList("META-INF", "versions", xs @ _, "module-info.class") => MergeStrategy.discard
case "module-info.class" => MergeStrategy.first
case x =>
val old = (assembly / assemblyMergeStrategy).value
old(x)
Expand Down
2 changes: 1 addition & 1 deletion project/build.properties
Original file line number Diff line number Diff line change
@@ -1 +1 @@
sbt.version=1.9.6
sbt.version=1.9.8
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@ import java.nio.file.Path

import scala.util.Using

import kantan.csv._
import kantan.csv.ops._
import com.github.tototoshi.csv._
import org.apache.commons.io.ByteOrderMark
import org.apache.commons.io.input.BOMInputStream
import org.broadinstitute.gpp.poolq3.reports.{GctDialect, PoolQ2Dialect, ReportsDialect}
Expand Down Expand Up @@ -77,34 +76,34 @@ object ReferenceData {
.setInclude(false)
.get()
val br = new BufferedReader(new InputStreamReader(in))
val delimiter = guessDelimiter(br)
val config =
CsvConfiguration(delimiter, quote, CsvConfiguration.QuotePolicy.WhenNeeded, CsvConfiguration.Header.None)
val guessedDelimiter = guessDelimiter(br)
implicit object CSVFormat extends DefaultCSVFormat {
override val delimiter = guessedDelimiter
override val quoteChar: Char = quote
}
skipHeader(br, LineRegex)
val reader = br.asCsvReader[List[String]](config)
val barcodes = reader.map {
case Right(xs) =>
xs match {
case barcodeRaw :: idRaw :: _ =>
// if the CSV parser leaves spaces, we should remove them
val barcode = barcodeRaw.trim()
val id = idRaw.trim()

// N.B. empty IDs are commonly used and must be supported; as long as the barcode is a non-empty, valid
// DNA string, we must accept the row. However, sometimes Excel leaves empty lines in exported CSV; as
// long as *both* the barcode and ID are empty, it's safe to just skip the row. For now we'll be paranoid
// and reject cases where the barcode is empty but the ID is non-empty
if (barcode.isEmpty && id.isEmpty) None
else if (isReferenceBarcode(barcode)) Some(ReferenceEntry(barcode, id))
else throw InvalidFileException(file, s"Invalid DNA barcode '$barcode' for ID '$id'")
case _ =>
throw InvalidFileException(
file,
s"Incorrect number of columns. At least 2 required, got: ${xs.length}: $xs"
)
}
case Left(value) => throw InvalidFileException(file, s"Unable to parse data ${value.getMessage}")
}.toList
val rows = CSVReader.open(br).all()
val barcodes = rows.map { case xs =>
xs match {
case barcodeRaw :: idRaw :: _ =>
// if the CSV parser leaves spaces, we should remove them
val barcode = barcodeRaw.trim()
val id = idRaw.trim()

// N.B. empty IDs are commonly used and must be supported; as long as the barcode is a non-empty, valid
// DNA string, we must accept the row. However, sometimes Excel leaves empty lines in exported CSV; as
// long as *both* the barcode and ID are empty, it's safe to just skip the row. For now we'll be paranoid
// and reject cases where the barcode is empty but the ID is non-empty
if (barcode.isEmpty && id.isEmpty) None
else if (isReferenceBarcode(barcode)) Some(ReferenceEntry(barcode, id))
else throw InvalidFileException(file, s"Invalid DNA barcode '$barcode' for ID '$id'")
case _ =>
throw InvalidFileException(
file,
s"Incorrect number of columns. At least 2 required, got: ${xs.length}: $xs"
)
}
}

if (barcodes.isEmpty) {
throw InvalidFileException(file, "Empty reference file")
Expand Down
2 changes: 1 addition & 1 deletion version.sbt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
ThisBuild / version := "3.7.3-SNAPSHOT"
ThisBuild / version := "3.8.0-SNAPSHOT"