Skip to content

Commit

Permalink
Merge pull request #428 from vgteam/lancet-import
Browse files Browse the repository at this point in the history
Add a script that can batch import and explain Lancet data in the November format
  • Loading branch information
adamnovak committed May 7, 2024
2 parents b76452a + 38ec2e6 commit 5b1a3f5
Show file tree
Hide file tree
Showing 5 changed files with 209 additions and 52 deletions.
141 changes: 141 additions & 0 deletions scripts/prepare_lancet_output.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#!/usr/bin/env bash
# prepare_lancet_input.sh: Turn a directory of Lancet output into a Tube Map BED file and directory that can be hosted on the web.

function usage() {
echo >&2 "${0}: Prepare a directory of Lancet data ."
echo >&2
echo >&2 "The input directory should have in it:"
echo >&2 "- variant_calling_regions.with_calls.bed, an extended BED file with biallelic VCF REF in column 6, ALT in column 7, and INFO in column 9 with a TYPE field"
echo >&2 "- giraffe_alignments/, a directory with, for each VCF record:"
echo >&2 " - <CONTIG>_<START>_<END>.giraffe.gbz"
echo >&2 " - normal.<CONTIG>_<START>_<END>.gam"
echo >&2 " - tumor.<CONTIG>_<START>_<END>.gam"
echo >&2 "The ranges in the file named will not correspond exactly to those in the BED records; files for a range overlapping and centered on the BED record will be used."
echo >&2
echo >&2 "Usage: ${0} <INPUT_DIR> <OUTPUT_DIR>"
exit 1
}

INPUT_DIR="${1}"
OUTPUT_DIR="${2}"

if [[ -z "${OUTPUT_DIR}" || -z "${INPUT_DIR}" || ! -d "${INPUT_DIR}" ]] ; then
usage
fi

# Make all the paths absolute
INPUT_DIR="$(realpath "${INPUT_DIR}")"
OUTPUT_DIR="$(realpath "${OUTPUT_DIR}")"

INPUT_BED="${INPUT_DIR}/variant_calling_regions.with_calls.bed"
INPUT_DATA_DIR="${INPUT_DIR}/giraffe_alignments"

if [[ ! -f "${INPUT_BED}" || ! -d "${INPUT_DATA_DIR}" ]] ; then
usage
fi


SCRIPTS_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"

mkdir -p "${OUTPUT_DIR}"

OUTPUT_BED="${OUTPUT_DIR}/index.bed"
rm -f "${OUTPUT_BED}"

# We need to ber in the output directory to use relative paths in the generated BED.
cd "${OUTPUT_DIR}"

while IFS="" read -r INPUT_LINE || [[ -n "${INPUT_LINE}" ]] ; do
# Read all the lines, even if the newline is missing. See <https://stackoverflow.com/a/1521498>

# Parse the BED
CONTIG="$(echo "${INPUT_LINE}" | cut -f1)"
# These are 0-based, end-exclusive coordinates
START_BED_POS="$(echo "${INPUT_LINE}" | cut -f2)"
END_BED_POS="$(echo "${INPUT_LINE}" | cut -f3)"
REF_SEQ="$(echo "${INPUT_LINE}" | cut -f6)"
ALT_SEQ="$(echo "${INPUT_LINE}" | cut -f7)"
INFO="$(echo "${INPUT_LINE}" | cut -f9)"

# Compute 1-based, end-inclusive coordinates
START_POS=$((START_BED_POS+1))
END_POS="${END_BED_POS}"

# Fetch out the type (MNP, INS, etc.)
VARIANT_TYPE="$(echo "${INFO}" | grep -o "TYPE=[A-Z]*" | cut -f2 -d'=')"

# Find the best overlapping graph file.
# These are the 1-based, end-exclusive coordinates from the filenames
BEST_START=""
BEST_END=""
# These are the 0-based, end-exclusive coordinates
BEST_BED_START=""
BEST_BED_END=""
# This is how many more bases are on one side fo the variant than are on the other
BEST_UNEVENNESS=""

CANDIDATES=("${INPUT_DATA_DIR}/${CONTIG}_"*".giraffe.gbz")
for GRAPH_FILE in "${CANDIDATES[@]}" ; do
GRAPH_BASENAME="$(basename "${GRAPH_FILE}")"
# Work out the region this graph covers. Remember to convert to 1-based
GRAPH_START="$(echo "${GRAPH_BASENAME%.giraffe.gbz}" | cut -f2 -d'_')"
GRAPH_END="$(echo "${GRAPH_BASENAME%.giraffe.gbz}" | cut -f3 -d'_')"

# Convert to 0-based, end-exclusive
GRAPH_BED_START=$((GRAPH_START-1))
GRAPH_BED_END="${GRAPH_END}"

BEFORE_BASES=$((START_BED_POS-GRAPH_BED_START))
AFTER_BASES=$((GRAPH_BED_END-END_BED_POS))
if [[ "${BEFORE_BASES}" -lt 0 || "${AFTER_BASES}" -lt 0 ]] ; then
# This graph does not overlap the variant we are working on
continue
fi

# Work out how uneven the centering is
if [[ "${BEFORE_BASES}" -gt "${AFTER_BASES}" ]] ; then
UNEVENNESS=$((BEFORE_BASES-AFTER_BASES))
else
UNEVENNESS=$((AFTER_BASES-BEFORE_BASES))
fi

if [[ -z "${BEST_UNEVENNESS}" || "${BEST_UNEVENNESS}" -gt "${UNEVENNESS}" ]] ; then
# This is the best graph so far
BEST_START="${GRAPH_START}"
BEST_END="${GRAPH_END}"
BEST_BED_START="${GRAPH_BED_START}"
BEST_BED_END="${GRAPH_BED_END}"
BEST_UNEVENNESS="${UNEVENNESS}"
fi
done

if [[ -z "${BEST_START}" ]] ; then
echo >&2 "No graph in ${INPUT_DATA_DIR} overlaps the variant ${INPUT_LINE}"
exit 1
fi

# Find the best graph again
INPUT_GRAPH="$(find "${INPUT_DATA_DIR}" -name "${CONTIG}_${BEST_START}_${BEST_END}.giraffe.gbz")"
INPUT_GRAPH_BASENAME="$(basename "${INPUT_GRAPH}")"
SLUG="${INPUT_GRAPH_BASENAME%.giraffe.gbz}"

INPUT_NORMAL_GAM="${INPUT_DATA_DIR}/normal.${SLUG}.gam"
INPUT_TUMOR_GAM="${INPUT_DATA_DIR}/tumor.${SLUG}.gam"

# Make the chunk directory. Remember to use a 1-based, end-inclusive range and description coordinates.
"${SCRIPTS_DIR}/prepare_local_chunk.sh" \
-x "${INPUT_GRAPH}" \
-g "${INPUT_NORMAL_GAM}" \
-p '{"mainPalette": "blues", "auxPalette": "blues"}' \
-g "${INPUT_TUMOR_GAM}" \
-p '{"mainPalette": "reds", "auxPalette": "reds"}' \
-r "${CONTIG}:${BEST_START}-${BEST_END}" \
-d "Tumor-specific ${REF_SEQ} -> ${ALT_SEQ} ${VARIANT_TYPE} variant at ${CONTIG}:${START_POS}" \
-o "${SLUG}" \
>>"${OUTPUT_BED}"

# We want to hide the path cover paths from Giraffe and only show the paths from the original Lancet GFA
vg paths --drop-paths --paths-by "path_cover_" -x "${SLUG}/chunk.vg" >"${SLUG}/chunk.vg.new"
mv "${SLUG}/chunk.vg.new" "${SLUG}/chunk.vg"

done <"${INPUT_BED}"
64 changes: 32 additions & 32 deletions scripts/prepare_local_chunk.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ NODE_COLORS=()
while getopts x:g:p:r:o:d:n: flag
do
case "${flag}" in
x) GRAPH_FILE=${OPTARG};;
x) GRAPH_FILE="${OPTARG}";;
g) GAM_FILES+=("$OPTARG");;
p) GAM_PALETTES+=("$OPTARG");;
r) REGION=${OPTARG};;
o) OUTDIR=${OPTARG};;
r) REGION="${OPTARG}";;
o) OUTDIR="${OPTARG}";;
d) DESC="${OPTARG}";;
n) NODE_COLORS+=("${OPTARG}");;
*)
Expand Down Expand Up @@ -57,45 +57,45 @@ if [[ -z "${DESC}" ]] ; then
DESC="Region ${REGION}"
fi

echo >&2 "Graph File: " $GRAPH_FILE
echo >&2 "Region: " $REGION
echo >&2 "Output Directory: " $OUTDIR
echo >&2 "Node colors: " ${NODE_COLORS[@]}
echo >&2 "Graph File: $GRAPH_FILE"
echo >&2 "Region: $REGION"
echo >&2 "Output Directory: $OUTDIR"
echo >&2 "Node colors: ${NODE_COLORS[@]}"

rm -fr $OUTDIR
mkdir -p $OUTDIR
rm -fr "$OUTDIR"
mkdir -p "$OUTDIR"

TEMP="$(mktemp -d)"


# Covert GAF files to GAM
for i in "${!GAM_FILES[@]}"; do
if [[ ${GAM_FILES[$i]} == *.gaf ]]; then
if [[ "${GAM_FILES[$i]}" == *.gaf ]]; then
# Filename without path
filename=$(basename -- ${GAM_FILES[$i]})
filename="$(basename -- "${GAM_FILES[$i]}")"
# Remove file extension
filename=${filename%.*}
vg convert --gaf-to-gam ${GAM_FILES[$i]} ${GRAPH_FILE} > $TEMP/${filename}.gam
filename="${filename%.*}"
vg convert --gaf-to-gam "${GAM_FILES[$i]}" "${GRAPH_FILE}" > "$TEMP/${filename}.gam"
GAM_FILES[$i]="$TEMP/${filename}.gam"
fi
done

# Parse the region
REGION_END="$(echo ${REGION} | rev | cut -f1 -d'-' | rev)"
REGION_START="$(echo ${REGION} | rev | cut -f2 -d'-' | cut -f1 -d':' | rev)"
REGION_CONTIG="$(echo ${REGION} | rev| cut -f2- -d':' | rev)"
REGION_END="$(echo "${REGION}" | rev | cut -f1 -d'-' | rev)"
REGION_START="$(echo "${REGION}" | rev | cut -f2 -d'-' | cut -f1 -d':' | rev)"
REGION_CONTIG="$(echo "${REGION}" | rev| cut -f2- -d':' | rev)"

# get path relative to directory above the scripts directory
GRAPH_FILE_PATH=$(realpath --relative-to $(dirname ${BASH_SOURCE[0]})/../ $GRAPH_FILE)
GRAPH_FILE_PATH="$(realpath --relative-to "$(dirname "${BASH_SOURCE[0]}")/../" "$GRAPH_FILE")"

# construct track JSON for graph file
GRAPH_PALETTE="$(cat "$(dirname ${BASH_SOURCE[0]})/../src/config.json" | jq '.defaultGraphColorPalette')"
jq -n --arg trackFile "${GRAPH_FILE_PATH}" --arg trackType "graph" --argjson trackColorSettings "$GRAPH_PALETTE" '$ARGS.named' >> $OUTDIR/temp.json
GRAPH_PALETTE="$(cat "$(dirname "${BASH_SOURCE[0]}")/../src/config.json" | jq '.defaultGraphColorPalette')"
jq -n --arg trackFile "${GRAPH_FILE_PATH}" --arg trackType "graph" --argjson trackColorSettings "$GRAPH_PALETTE" '$ARGS.named' >> "$OUTDIR/temp.json"

# Put the graphy file in place
vg convert -p "${GRAPH_FILE}" > $OUTDIR/chunk.vg
vg convert -p "${GRAPH_FILE}" > "$OUTDIR/chunk.vg"
# Start the region BED inside the chunk
printf "${REGION_CONTIG}\t${REGION_START}\t${REGION_END}" > $OUTDIR/regions.tsv
printf "${REGION_CONTIG}\t${REGION_START}\t${REGION_END}" > "$OUTDIR/regions.tsv"


echo >&2 "Gam Files:"
Expand All @@ -107,9 +107,9 @@ for GAM_FILE in "${GAM_FILES[@]}"; do
if [[ -z "${GAM_PALETTE}" ]] ; then
GAM_PALETTE="${DEFAULT_READ_PALETTE}"
fi
GAM_FILE_PATH=$(realpath --relative-to $(dirname ${BASH_SOURCE[0]})/../ $GAM_FILE)
GAM_FILE_PATH=$(realpath --relative-to "$(dirname "${BASH_SOURCE[0]}")/../" "$GAM_FILE")
# construct track JSON for each gam file
jq -n --arg trackFile "${GAM_FILE_PATH}" --arg trackType "read" --argjson trackColorSettings "$GAM_PALETTE" '$ARGS.named' >> $OUTDIR/temp.json
jq -n --arg trackFile "${GAM_FILE_PATH}" --arg trackType "read" --argjson trackColorSettings "$GAM_PALETTE" '$ARGS.named' >> "$OUTDIR/temp.json"
# Work out a chunk-internal GAM name with the same leading numbering vg chunk uses
if [[ "${GAM_NUM}" == "0" ]] ; then
GAM_LEADER="chunk"
Expand All @@ -120,35 +120,35 @@ for GAM_FILE in "${GAM_FILES[@]}"; do
# Put the chunk in place
cp "${GAM_FILE}" "${GAM_CHUNK_NAME}"
# List it in the regions TSV like vg would
printf "\t$(basename "${GAM_CHUNK_NAME}")" >> $OUTDIR/regions.tsv
printf "\t$(basename "${GAM_CHUNK_NAME}")" >> "$OUTDIR/regions.tsv"
GAM_NUM=$((GAM_NUM + 1))
done

# put all tracks objects into an array
(jq -s '.' < $OUTDIR/temp.json) > $OUTDIR/tracks.json
(jq -s '.' < "$OUTDIR/temp.json") > "$OUTDIR/tracks.json"

rm $OUTDIR/temp.json
rm "$OUTDIR/temp.json"

# construct node file
if [[ ! -z "${NODE_COLORS}" ]] ; then
for NODENAME in "${NODE_COLORS[@]}"; do
echo >&2 "Highlighted node: $NODENAME"
printf "$NODENAME\n" >> $OUTDIR/nodeColors.tsv
printf "$NODENAME\n" >> "$OUTDIR/nodeColors.tsv"
done
fi

# Make the empty but required annotation file. We have no haplotypes to put in it.
touch "${OUTDIR}/chunk_0_${REGION_CONTIG}_${REGION_START}_${REGION_END}.annotate.txt"
printf "\tchunk_0_${REGION_CONTIG}_${REGION_START}_${REGION_END}.annotate.txt\n" >> $OUTDIR/regions.tsv
printf "\tchunk_0_${REGION_CONTIG}_${REGION_START}_${REGION_END}.annotate.txt\n" >> "$OUTDIR/regions.tsv"

for file in `ls $OUTDIR/`
for file in `ls "$OUTDIR/"`
do
printf "$file\n" >> $OUTDIR/chunk_contents.txt
printf "$file\n" >> "$OUTDIR/chunk_contents.txt"
done

# Print BED line
cat $OUTDIR/regions.tsv | cut -f1-3 | tr -d "\n"
cat "$OUTDIR/regions.tsv" | cut -f1-3 | tr -d "\n"
printf "\t${DESC}\t${OUTDIR}\n"

rm -fr $TEMP
rm -fr "$TEMP"

14 changes: 9 additions & 5 deletions src/components/CopyLink.js
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,17 @@ export const urlParamsToViewTarget = (url) => {
result = qs.parse(s[1]);
}

// Ensures that the simplify field is a boolean, as the qs module can't tell


// Ensures that the flag fields are booleans, as the qs module can't tell
// the difference between false and "false"
if (result != null) {
if (result.simplify === "true") {
result.simplify = true;
} else if (result.simplify === "false") {
result.simplify = false;
for (let flag_name of ["simplify", "removeSequences"]) {
if (result[flag_name] === "true") {
result[flag_name] = true;
} else if (result[flag_name] === "false") {
result[flag_name] = false;
}
}
}

Expand Down
15 changes: 12 additions & 3 deletions src/server.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -1828,12 +1828,17 @@ async function getBedRegions(bed) {
}

lines = bed_data.split(/\r?\n/);
lines.map(function (line) {

for (let [index, line] of lines.entries()) {
let records = line.split("\t");

if (records.length < 3) {
// This is an empty line or otherwise not BED
return;
if (line !== "") {
// This is a bad line
throw new BadRequestError("BED line " + (index + 1) + " could not be parsed");
}
continue;
}
bed_info["chr"].push(records[0]);
bed_info["start"].push(records[1]);
Expand All @@ -1848,7 +1853,11 @@ async function getBedRegions(bed) {
chunk = records[4];
}
bed_info["chunk"].push(chunk);
});
}

if (bed_info.length === 0) {
BadRequestError("BED file is empty");
}

// check for a tracks.json file to prefill tracks configuration
for (let i = 0; i < bed_info["chunk"].length; i++) {
Expand Down
27 changes: 15 additions & 12 deletions src/util/tubemap.js
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ export function changeAllTracksVisibility(value) {
let i = 0;
while (i < inputTracks.length) {
inputTracks[i].hidden = !value;
var checkbox = document.getElementById(`showTrack${i}`);
var checkbox = document.getElementById(`showTrack${inputTracks[i].id}`);
checkbox.checked = value;
i += 1;
}
Expand Down Expand Up @@ -661,14 +661,16 @@ function placeReads() {

// Space out read tracks if multiple exist
let topMargin = allSources.length > 1 ? READ_WIDTH : 0;
for (let source of allSources) {
// Go through all source tracks in order
sortedNodes.forEach((node) => {
// And for each node, place these reads in it.
sortedNodes.forEach((node) => {
// For each node
for (let source of allSources) {
// Go through all source tracks in order

// Place the reads from this source in this node.
// Use a margin to separate multiple read tracks if we have them.
placeReadSet(readsBySource[source], node, topMargin);
});
}
}
});

// place read segments which are without node
const bottomY = calculateBottomY();
Expand Down Expand Up @@ -4070,6 +4072,7 @@ function drawLegend() {
content +=
'<table class="table-sm table-condensed table-nonfluid"><thead><tr><th>Color</th><th>Trackname</th><th>Show Track</th></tr></thead>';
const listeners = [];
// This is in terms of tracks, but when we change visibility we need to touch inputTracks, so we need to set up listeners by track ID.
for (let i = 0; i < tracks.length; i += 1) {
if (tracks[i].type === "haplo") {
content += `<tr><td style="text-align:right"><div class="color-box" style="background-color: ${generateTrackColor(
Expand All @@ -4081,17 +4084,17 @@ function drawLegend() {
} else {
content += `<td>${tracks[i].id}</td>`;
}
content += `<td><input type="checkbox" checked=true id="showTrack${i}"></td>`;
listeners.push(i);
content += `<td><input type="checkbox" checked=true id="showTrack${tracks[i].id}"></td>`;
listeners.push(tracks[i].id);
}
}
content += "</table";
// $('#legendDiv').html(content);
document.getElementById("legendDiv").innerHTML = content;
listeners.forEach((i) => {
listeners.forEach((id) => {
document
.getElementById(`showTrack${i}`)
.addEventListener("click", () => changeTrackVisibility(i), false);
.getElementById(`showTrack${id}`)
.addEventListener("click", () => changeTrackVisibility(id), false);
});
document
.getElementById("selectall")
Expand Down

0 comments on commit 5b1a3f5

Please sign in to comment.