#!/bin/zsh #Tamsyn May 2015, final version #Target gene assembly using DNA-seq reads and mRNA (or genomic) target sequence #more efficent assembly than original target_gene_assembly.sh, blasts against all transcripts in transcripts.fasta and separates rather than separate blasts for each individual gene #similarity filtering set at 95% identity for roach template, adjust to 75% identity for zf template #input files and location: # ~/tamsyn_roach/target_gene_assembly/read1.fasta # ~/tamsyn_roach/target_gene_assembly/read2.fasta # transcripts.fasta **** vital this file is in the right format: #header lines: >gene-transcript_id or >gene-roach_ncbi or > gene-consensus #gene name must be first and must be followed by a '-' rather than '_' #when there are multiple transcripts for a single gene, gene name must be identical i.e. >sox3-c90600_g3_i1 and >sox3-c90600_g5_i #no white space/artifical line breaks between lines of the fasta format #######Start with ground work for loops######## #extract list of unique gene names (no transcript numbers attached) cat transcripts.fasta | grep ">" | awk -F "-" '{print $1}' | sed 's/>//' | sort -u > gene_names.txt #create a folder for each gene called 'gene_name' for gene in $(cat gene_names.txt) do mkdir $gene done #separate transcripts.fasta into individual fasta files for each gene, saved in gene folder #first create a fasta.txt file with joined header & sequence in single line cat transcripts.fasta| tr '\n' '@' | sed 's/@>/£>/g' | tr '£' '\n' | sed '/^$/d' > transcripts.fasta.txt #extract each line (sequence) matching each gene, grep -w specifies match whole word otherwise "ar" also returns "bcar" etc, convert back to fasta and save in relevant folder for line in $(cat gene_names.txt) do less transcripts.fasta.txt| grep -w $line | tr '@' '\n' > $line/$line.fasta done #########First round of blast####### formatdb -i transcripts.fasta -p F blastall -p blastn -d transcripts.fasta -i ~/tamsyn_roach/target_gene_assembly/read1.fasta -e 1e-5 -m 8 -o blast_out_R1 blastall -p blastn -d transcripts.fasta -i ~/tamsyn_roach/target_gene_assembly/read2.fasta -e 1e-5 -m 8 -o blast_out_R2 #extracting full (fasta) read sequences from blast hits (read names only) #use join to extract fasta sequences for blast hits cat blast_out_R1 | awk '{print $1}' | sort -u > sorted_blast_out_R1 join sorted_blast_out_R1 ~/tamsyn_roach/target_gene_assembly/sorted_read1.fasta.text -t '!' > R1_blast_hits.fasta.text cat blast_out_R2 | awk '{print $1}' | sort -u > sorted_blast_out_R2 join sorted_blast_out_R2 ~/tamsyn_roach/target_gene_assembly/sorted_read1.fasta.text -t '!' > R2_blast_hits.fasta.text ###########################FILTERING AND CAP3################################################# #select blast hits based on sequence length (using 50bp cutoff, and 95 % identity) for feeding into assembly less blast_out_R1 | awk '{ if ($4 >= 50) if ($3 >= 95) print $1"!"$2}' | sort -u > R1_50bp_ids less blast_out_R2 | awk '{ if ($4 >= 50) if ($3 >= 95) print $1"!"$2}' | sort -u > R2_50bp_ids join R1_50bp_ids R1_blast_hits.fasta.text -t '!' | tr '!' '\t' | awk '{print ">"$1"_1-"$2"!"$3}' | sort -u > good_R1_blast_hits.text join R2_50bp_ids R2_blast_hits.fasta.text -t '!' | tr '!' '\t' | awk '{print ">"$1"_1-"$2"!"$3}' | sort -u > good_R2_blast_hits.text cat good_R1_blast_hits.text good_R2_blast_hits.text > all_good_blast_hits.text for line in $(cat gene_names.txt) do less all_good_blast_hits.text| grep -w $line | tr '!' '\n' > $line/cap.fasta cd $line cap3 cap.fasta > cap3_out less cap.fasta.cap.contigs | sed "s/Contig/$line-c/g" > ../$line-contigs.fasta cd .. done cat *contigs.fasta > all_cap1.contigs ####################Second round blasting################################################# formatdb -p f -i all_cap1.contigs blastall -p blastn -d all_cap1.contigs -i ~/tamsyn_roach/target_gene_assembly/read1.fasta -e 1e-5 -m 8 -o blast_out_R1_2 blastall -p blastn -d all_cap1.contigs -i ~/tamsyn_roach/target_gene_assembly/read2.fasta -e 1e-5 -m 8 -o blast_out_R2_2 ###########################FILTERING AND CAP3################################################# cat blast_out_R1_2 | awk '{ if ($4 >= 50) if ($3 >= 95) print $1"!"$2}' | sort -u > R1_50bp_ids_2 cat blast_out_R2_2 | awk '{ if ($4 >= 50) if ($3 >= 95) print $1"!"$2}' | sort -u > R2_50bp_ids_2 cat blast_out_R1_2 | awk '{print $1}' | sort -u > sorted_blast_out_R1_2 join sorted_blast_out_R1_2 ~/tamsyn_roach/target_gene_assembly/sorted_read1.fasta.text -t '!' > R1_blast_hits.fasta.text_2 cat blast_out_R2_2 | awk '{print $1}' | sort -u > sorted_blast_out_R2_2 join sorted_blast_out_R2_2 ~/tamsyn_roach/target_gene_assembly/sorted_read2.fasta.text -t '!' > R2_blast_hits.fasta.text_2 join R1_50bp_ids_2 R1_blast_hits.fasta.text_2 -t '!' | tr '!' '\t' | awk '{print ">"$1"_1-"$2"!"$3}' | sort -u > good_R1_blast_hits.text_2 join R2_50bp_ids_2 R2_blast_hits.fasta.text_2 -t '!' | tr '!' '\t' | awk '{print ">"$1"_1-"$2"!"$3}' | sort -u > good_R2_blast_hits.text_2 cat good_R1_blast_hits.text_2 good_R2_blast_hits.text_2 > all_good_blast_hits.text_2 for line in $(cat gene_names.txt) do cat all_good_blast_hits.text_2| grep -w $line | tr '!' '\n' > $line/cap2.fasta cd $line cat cap.fasta cap2.fasta > all_cap.fasta cap3 all_cap.fasta > cap3_out2 cat all_cap.fasta.cap.contigs | sed "s/Contig/$line-c/g" > $line-contigs2.fasta cd .. done ################CHECKING ASSEMBLED CONTIGS AND EXTRACTING USEFUL INFO##################################### for line in $(cat gene_names.txt) do cd $line print $line formatdb -i $line.fasta -p F blastall -p blastn -d $line.fasta -i $line-contigs2.fasta -m 8 -e 1e-5 | awk '{ if ($3 >= 95.0) print $1"\t"$9"\t"$10}' | awk '{ if ($2>$3) print ">"$1"_"$3"_*"$2"!" }'| sort -k 1,1 -t '_' -u > reverse_contig_ids blastall -p blastn -d $line.fasta -i $line-contigs2.fasta -m 8 -e 1e-5 | awk '{ if ($3 >= 95.0) print $1"\t"$9"\t"$10}' | awk '{ if ($3>$2) print ">"$1"_"$2"_"$3"!" }'| sort -k 1,1 -t '_' -u > forward_contig_ids fasta_formatter -i $line-contigs2.fasta | sed '/^$/d'| tr '\n' '_' | sed 's/_>/£>/g' | tr '£' '\n' | sed '/^$/d' | sort -k 1,1 -t '_' > cap.contigs.text join forward_contig_ids cap.contigs.text -t '_'| sed 's/!_/!/g' > forward_cap_contigs.text join reverse_contig_ids cap.contigs.text -t '_' | sed 's/!_/!/g' | sed 's/A_/A/' | sed 's/T_/T/'| sed 's/G_/G/' | sed 's/C_/C/' | tr '!' '\n' | fastx_reverse_complement | tr '\n' '!'| sed 's/!>/£>/g' | tr '£' '\n' | sed '/^$/d' > rc_reverse_cap_contigs.text cat forward_cap_contigs.text rc_reverse_cap_contigs.text | sort -n -t '_' -k 2,2 | tr '_' '!' | awk -F '!' '{print $1"\n"$4}' > $line-final_contigs.fasta awk '/^>/ {print; next; } { seqlen = length($0); print seqlen}' $line-final_contigs.fasta | tr '\n' '!' | sed 's/!>/£>/g' | tr '£' '\n' | sed '/^$/d' > $line-final_contig_lengths blastall -p blastn -d $line.fasta -i $line-final_contigs.fasta -m 8 -e 1e-5 | awk '{print ">"$1"!"$2"!"$9"!"$10"!"$7"!"$8}'> $line-final_contig_alignments join $line-final_contig_lengths $line-final_contig_alignments -t '!' | tr '!' '\t' | sort -n -k 4,4 | sort --stable -k 3,3 | sed '1i\contig_name\tcontig_length\ttarget\ttarget_start\ttarget_end\tcontig_start\tcontig_end' > $line-final_contig_info.text cd .. done rm -f cap* forward* reverse* rc* *alignments *lengths format*