in dpgr_variant_pairs-full-tuples-2025-12-14.csv
Its columns are:
pair_id,DPGR Row,DPGR Location,DPGR Time Window,DPGR (Slope),Variant 1,Variant 2,Variant 1 Accession ID,Variant 2 Accession ID,Variant 1 Virus name,Variant 2 Virus name,Variant 1 Collection date,Variant 2 Collection date,Variant 1 Location,Variant 2 Location,Variant 1 Sequence length,Variant 2 Sequence length
It contains pairs of virus names in the 'Variant 1 Virus name' and 'Variant 2 Virus name' columns.
Write shell code to find the unique 'Virus name' values and output them to a CSV file named "uniq_virus_names_from_dpgr_variant_pairs-full-tuples-2025-12-14.csv".
#!/bin/bash
#
# Collect every distinct virus name that appears in either the
# 'Variant 1 Virus name' (column 10) or 'Variant 2 Virus name' (column 11)
# field of the variant-pair CSV and write them, one per line under a single
# header line, to a new CSV file.
#
# Usage: run from the directory that contains the input CSV; takes no arguments.

# Abort on any failed command, unset variable, or failed pipeline stage so the
# success message below is never printed after a partial extraction.
set -euo pipefail

INPUT_FILE="dpgr_variant_pairs-full-tuples-2025-12-14.csv"
OUTPUT_FILE="uniq_virus_names_from_dpgr_variant_pairs-full-tuples-2025-12-14.csv"

# Check that the input file exists; diagnostics belong on stderr.
if [[ ! -f "$INPUT_FILE" ]]; then
  echo "Error: Input file '$INPUT_FILE' not found." >&2
  exit 1
fi

# Pipeline overview:
#   * awk -F, splits each line on plain commas. It does NOT understand quoted
#     CSV fields, so a quoted field containing a comma shifts later columns.
#   * The header line (NR == 1) is skipped; its field count is remembered so
#     data rows with a different count can be reported as a warning.
#   * For every data row, non-empty values of columns 10 and 11 are printed on
#     separate lines (empty fields would otherwise become blank "names").
#   * LC_ALL=C sort -u orders and deduplicates byte-wise, so two names are
#     merged only when they are byte-identical, regardless of the user locale.
#   * The group's combined stdout (title line + names) goes to the output file.
{
  # Print the header (just a descriptive column title for the output)
  echo "Unique Virus Name"
  # Process the data
  awk -F, '
    NR == 1 { expected_fields = NF; next }
    NF != expected_fields { suspicious_rows++ }
    {
      if ($10 != "") print $10
      if ($11 != "") print $11
    }
    END {
      if (suspicious_rows > 0) {
        printf "Warning: %d row(s) have a different field count than the header; names taken from them may be wrong.\n", suspicious_rows > "/dev/stderr"
      }
    }
  ' "$INPUT_FILE" |
    LC_ALL=C sort -u
} > "$OUTPUT_FILE"

echo "✅ Success: Unique virus names extracted from '$INPUT_FILE' and saved to '$OUTPUT_FILE'."
echo "---"
echo "The file '$OUTPUT_FILE' contains:"
head "$OUTPUT_FILE"
echo "..."
0:20am. aws
(dpgr310) [hqin@ip-10-3-4-198 dpgr_build_training_data]$ python scripts/filter_fasta_by_virus_names.py --names-csv uniq_virus_names_from_dpgr_variant_pairs-full-tuples-2025-12-14.csv --fasta data/raw/sequences.fasta --output /tmp/output.fasta --missing-output /tmp/missing.txt &
(dpgr310) [hqin@ip-10-3-4-198 dpgr_build_training_data]$ grep ">" /tmp/output.fasta | head
>hCoV-19/USA/CA-CDC-QDX47542243/2023|2023-03-03|2023-03-20
>hCoV-19/USA/CA-CDPH-FS48082135/2022|2022-12-16|2023-01-30
>hCoV-19/Japan/PG-77665/2021|2021-07-15|2021-09-02
>hCoV-19/USA/CT-Yale-11825/2021|2021-10-05|2022-10-29
>hCoV-19/Israel/CVL-18032/2021|2021-08-11|2021-10-29
>hCoV-19/Denmark/DCGC-569088/2022|2022-08-21|2022-08-29
>hCoV-19/Indonesia/JK-NIHRD-WGS-22-16699/2022|2022-08-09|2022-08-23
>hCoV-19/USA/FL-BPHL-1692/2021|2021-03-01|2021-04-22
>hCoV-19/Russia/PRI-5829/2021|2021-01-18|2023-01-25
>hCoV-19/South Korea/KDCA25825/2021|2021-12-13|2022-01-12
(dpgr310) [hqin@ip-10-3-4-198 dpgr_build_training_data]$ grep hCoV-19/USA/CA-CDC-QDX47542243/2023 *csv
dpgr_variant_pairs-full-tuples-2025-12-14.csv:PAIR028262,463,North America,2023-02-13 to 2023-04-17,0.0061116162065131,XBB.1.5.1,XBB.1.5.66,EPI_ISL_18883356,EPI_ISL_17250301,hCoV-19/USA/NY-UB-ECMC-00747/2023hCoV-19/USA/CA-CDC-QDX47542243/2023,2023-03-20,2023-03-03,North America / USA / New York / Erie County,North America / USA / California,29837,29721
dpgr_variant_pairs-full-tuples-2025-12-14-small.csv:PAIR028262,463,North America,2023-02-13 to 2023-04-17,0.0061116162065131,XBB.1.5.1,XBB.1.5.66,EPI_ISL_18883356,EPI_ISL_17250301,hCoV-19/USA/NY-UB-ECMC-00747/2023,hCoV-19/USA/CA-CDC-QDX47542243/2023,2023-03-20,2023-03-03,North America / USA / New York / Erie County,North America / USA / California,29837,29721
sample_pair_validation_results.csv:463,EPI_ISL_18748636,EPI_ISL_17250301,"Virus name: hCoV-19/USA/IL-S23WGS0739/2023; Passage details/history: Original; Type: betacoronavirus; Accession ID: EPI_ISL_18748636; Collection date: 2023-02-26; Location: North America / USA / Illinois / Jefferson; Sequence length: 29752; Host: Human; Patient age: 80; Gender: Male; Clade: GRA; Pango lineage: XBB.1.5.1; Pango version: consensus call; Variant: Former VOI (XBB.1.5+XBB.1.5.*); AA Substitutions: (NSP5_P132H,NSP16_A168V,NSP12_G671S,NSP3_G489S,Spike_L24del,NSP4_T327I,Spike_N969K,Spike_H655Y,Spike_G142D,Spike_A27S,Spike_Q954H,N_P13L,Spike_N501Y,Spike_P25del,N_R32del,Spike_V213E,NS3_T223I,Spike_T19I,Spike_H146Q,M_Q19E,Spike_N440K,NSP4_T492I,Spike_N460K,Spike_N679K,Spike_N764K,E_T11A,NSP6_G107del,Spike_Y505H,NSP14_M58I,Spike_D796Y,Spike_T478K,M_A63T,Spike_R346T,Spike_S371F,Spike_K417N,NSP13_R392C,Spike_L368I,Spike_T376A,NSP6_S106del,Spike_F490S,Spike_R408S,NSP4_L438F,Spike_G339H,NSP14_I42V,NSP4_L264F,Spike_P681H,Spike_Y144del,Spike_V83A,NSP3_T24I,N_S33del,NSP1_S135R,Spike_S375F,Spike_D405N,Spike_Q498R,NSP13_S36P,Spike_Q183E,Spike_S477N,N_E31del,NSP15_T112I,NSP6_F108del,Spike_T573I,E_T9I,NSP1_K47R,Spike_P26del,NSP12_P323L,Spike_D614G,Spike_G252V); Submission date: 2024-01-11; Is complete?: True; N-Content: 0.0254772788384; GC-Content: 0.378940470442; Region: North America","Virus name: hCoV-19/USA/CA-CDC-QDX47542243/2023; Passage details/history: Original; Type: betacoronavirus; Accession ID: EPI_ISL_17250301; Collection date: 2023-03-03; Location: North America / USA / California; Sequence length: 29721; Host: Human; Patient age: 55; Gender: Female; Clade: GRA; Pango lineage: XBB.1.5.66; Pango version: PANGO-v1.23; Variant: Former VOI (XBB.1.5+XBB.1.5.*); AA Substitutions: 
(NSP5_P132H,NSP12_G671S,NSP3_G489S,Spike_L24del,NSP4_T327I,Spike_S373P,Spike_N969K,Spike_H655Y,N_R203K,NSP2_G339S,Spike_G142D,Spike_A27S,Spike_Q954H,N_P13L,Spike_N501Y,Spike_P25del,N_R32del,Spike_V213E,NS3_T223I,Spike_T19I,Spike_H146Q,M_Q19E,Spike_N440K,NSP4_T492I,Spike_N460K,Spike_N679K,Spike_N764K,E_T11A,NSP6_G107del,Spike_Y505H,Spike_D796Y,N_G204R,Spike_T478K,N_S413R,M_A63T,Spike_R346T,NSP6_M143I,Spike_S371F,Spike_V445P,NSP13_R392C,Spike_K417N,Spike_L368I,Spike_T376A,NSP6_S106del,NS8_G8stop,Spike_F490S,Spike_F486P,Spike_R408S,NSP4_L438F,Spike_G339H,NSP14_I42V,NSP4_L264F,Spike_P681H,Spike_Y144del,Spike_V83A,NSP3_T24I,N_S33del,NSP1_S135R,Spike_S375F,Spike_D405N,Spike_Q498R,Spike_G446S,NSP13_S36P,Spike_Q183E,Spike_S477N,N_E31del,NSP15_T112I,NSP6_F108del,Spike_E484A,E_T9I,NS7a_Q94H,NSP1_K47R,Spike_P26del,NSP12_P323L,Spike_D614G,Spike_G252V); Submission date: 2023-03-20; Is complete?: True; GC-Content: 0.379125870597; Region: North America",Correct,
sample_pair_validation_results.csv:463,EPI_ISL_18748636,EPI_ISL_17250301,"Virus name: hCoV-19/USA/IL-S23WGS0739/2023; Passage details/history: Original; Type: betacoronavirus; Accession ID: EPI_ISL_18748636; Collection date: 2023-02-26; Location: North America / USA / Illinois / Jefferson; Sequence length: 29752; Host: Human; Patient age: 80; Gender: Male; Clade: GRA; Pango lineage: XBB.1.5.1; Pango version: consensus call; Variant: Former VOI (XBB.1.5+XBB.1.5.*); AA Substitutions: (NSP5_P132H,NSP16_A168V,NSP12_G671S,NSP3_G489S,Spike_L24del,NSP4_T327I,Spike_N969K,Spike_H655Y,Spike_G142D,Spike_A27S,Spike_Q954H,N_P13L,Spike_N501Y,Spike_P25del,N_R32del,Spike_V213E,NS3_T223I,Spike_T19I,Spike_H146Q,M_Q19E,Spike_N440K,NSP4_T492I,Spike_N460K,Spike_N679K,Spike_N764K,E_T11A,NSP6_G107del,Spike_Y505H,NSP14_M58I,Spike_D796Y,Spike_T478K,M_A63T,Spike_R346T,Spike_S371F,Spike_K417N,NSP13_R392C,Spike_L368I,Spike_T376A,NSP6_S106del,Spike_F490S,Spike_R408S,NSP4_L438F,Spike_G339H,NSP14_I42V,NSP4_L264F,Spike_P681H,Spike_Y144del,Spike_V83A,NSP3_T24I,N_S33del,NSP1_S135R,Spike_S375F,Spike_D405N,Spike_Q498R,NSP13_S36P,Spike_Q183E,Spike_S477N,N_E31del,NSP15_T112I,NSP6_F108del,Spike_T573I,E_T9I,NSP1_K47R,Spike_P26del,NSP12_P323L,Spike_D614G,Spike_G252V); Submission date: 2024-01-11; Is complete?: True; N-Content: 0.0254772788384; GC-Content: 0.378940470442; Region: North America","Virus name: hCoV-19/USA/CA-CDC-QDX47542243/2023; Passage details/history: Original; Type: betacoronavirus; Accession ID: EPI_ISL_17250301; Collection date: 2023-03-03; Location: North America / USA / California; Sequence length: 29721; Host: Human; Patient age: 55; Gender: Female; Clade: GRA; Pango lineage: XBB.1.5.66; Pango version: PANGO-v1.23; Variant: Former VOI (XBB.1.5+XBB.1.5.*); AA Substitutions: 
(NSP5_P132H,NSP12_G671S,NSP3_G489S,Spike_L24del,NSP4_T327I,Spike_S373P,Spike_N969K,Spike_H655Y,N_R203K,NSP2_G339S,Spike_G142D,Spike_A27S,Spike_Q954H,N_P13L,Spike_N501Y,Spike_P25del,N_R32del,Spike_V213E,NS3_T223I,Spike_T19I,Spike_H146Q,M_Q19E,Spike_N440K,NSP4_T492I,Spike_N460K,Spike_N679K,Spike_N764K,E_T11A,NSP6_G107del,Spike_Y505H,Spike_D796Y,N_G204R,Spike_T478K,N_S413R,M_A63T,Spike_R346T,NSP6_M143I,Spike_S371F,Spike_V445P,NSP13_R392C,Spike_K417N,Spike_L368I,Spike_T376A,NSP6_S106del,NS8_G8stop,Spike_F490S,Spike_F486P,Spike_R408S,NSP4_L438F,Spike_G339H,NSP14_I42V,NSP4_L264F,Spike_P681H,Spike_Y144del,Spike_V83A,NSP3_T24I,N_S33del,NSP1_S135R,Spike_S375F,Spike_D405N,Spike_Q498R,Spike_G446S,NSP13_S36P,Spike_Q183E,Spike_S477N,N_E31del,NSP15_T112I,NSP6_F108del,Spike_E484A,E_T9I,NS7a_Q94H,NSP1_K47R,Spike_P26del,NSP12_P323L,Spike_D614G,Spike_G252V); Submission date: 2023-03-20; Is complete?: True; GC-Content: 0.379125870597; Region: North America",Correct,
uniq_virus_names_from_dpgr_variant_pairs-full-tuples-2025-12-14.csv:hCoV-19/USA/CA-CDC-QDX47542243/2023