#运行Ubuntu之前，先在电脑D盘创建目录win16s，并将样本测序数据文件(raw_data.zip)，mapping.txt与qiime_params.txt放在此目录下。
#此流程所有命令都需在WSL-Ubuntu的bash终端下运行（以下命令前面的“$”代表bash提示符，不需要输入）。

#Navigate to your working directory
$cd /mnt/d/win16s/

#准备raw reads ~This pipeline assumes that you have an input folder of paired-end reads files, with the default "_R1" and "_R2" containing the forward and reverse reads, respectively.
$unzip raw_data.zip
$ls raw_data  #显示文件列表，如SAA_R1.fastq.gz, SAA_R2.fastq.gz ...，如果文件名不对，需要重新命名

#1.验证mapping file ~ 按样本名称修改mapping.txt，可忽略Barcode与Primer序列为空的错误
validate_mapping_file.py -m mapping.txt -o validate_map

#2.合并双端序列
$mdir  join_pe_reads
#$vsearch --fastq_mergepairs raw_data/SAA_R1.fastq.gz --reverse raw_data/SAA_R2.fastq.gz --fastqout join_pe_reads/SAA.fq
#多个样品数据
$for i in `tail -n+2 mapping.txt |cut -f 1`; do vsearch --fastq_mergepairs raw_data/${i}_R1.fastq.gz --reverse raw_data/${i}_R2.fastq.gz --fastqout join_pe_reads/${i}.fq ; done

#3.序列质量控制 ~ Using FastQC to check read length firstly, reads with length < 380 are filtered
$mkdir qual_filter
#$vsearch --fastx_filter join_pe_reads/SAA.fq --fastq_maxee 1.0 --fastq_minlen 380 --fastqout join_pe_reads/SAA_filter.fq
#多样品数据，直接copy命令到terminal执行
for i in `tail -n+2 mapping.txt |cut -f 1`; do vsearch --fastx_filter join_pe_reads/${i}.fq --fastq_maxee 1.0 --fastq_minlen 380 --fastqout qual_filter/${i}.fq ; done

#4.split libraries
$mkdir split_out
$multiple_split_libraries_fastq.py -i qual_filter/ -o split_out/ --demultiplexing_method sampleid_by_file

#5.Dereplication(序列去重复)
vsearch --derep_fulllength split_out/seqs.fna --output split_out/seqs_derep.fa --sizeout --minuniquesize 2

#6.嵌合体检测
#$vsearch --uchime_ref split_out/seqs_derep.fa --db rdp_gold.fa --nonchimeras seqs_nochimeras.fa #需要参考序列文件
$vsearch --uchime_denovo split_out/seqs_derep.fa --nonchimeras seqs_nochimeras.fa

#7.OTU clustering(聚类) ~the longest sequence is picked as the cluster representative.
$vsearch --cluster_fast seqs_nochimeras.fa --id 0.97 --centroids otus.fa --relabel OTU_

#8.Assign taxonomy to OTUs using the RDP classifier on QIIME(分类注释)
$assign_taxonomy.py -i otus.fa -m rdp -o taxonomy
#OTU taxonomy was then determined using the Ribosomal Database Project classifier retrained toward the Greengenes database of 13_8 version

#9.Map reads back to the OTU data
$vsearch --usearch_global split_out/seqs.fna --db otus.fa --strand plus --id 0.97 --otutabout otus_table.txt

#10.Convert otu_table.txt to otu-table.biom
$biom convert -i otus_table.txt -o otus_table.biom --to-hdf5 --table-type "OTU table"
$biom add-metadata -i otus_table.biom -o otus_table_tax.biom --observation-metadata-fp taxonomy/otus_tax_assignments.txt --sc-separated taxonomy --observation-header OTUID,taxonomy

#11.Check OTU Table on QIIME
$biom summarize-table -i otus_table_tax.biom -o summary_biom.txt
#记录数据量最少的样本的reads数(Counts)，用于后续的统计抽样(步骤16中的-e参数）
$biom convert -i otus_table_tax.biom -o otus_table_tsv.txt --table-type "OTU table" --to-tsv --header-key taxonomy
#open otu_table_tsv.txt with Excel to modify any error, then convert back to biom format
$biom convert -i otus_table_tsv.txt  -o otus_table_tsv.biom --table-type "OTU table" --to-hdf5 --table-type="OTU table" --process-obs-metadata taxonomy

#12.可视化 ~要先运行Xming
#Plot relative abundance for all samples
$summarize_taxa_through_plots.py -i otus_table_tax.biom -m mapping.txt -o taxa_summary
#Create a summary for at species level
$summarize_taxa.py -i otus_table_tax.biom -o taxonomy_summary_species -L 7
$plot_taxa_summary.py -i taxonomy_summary_species/otus_table_tax_L7.txt -o taxonomy_summary_species

#13.Align sequences on QIIME using the ClustalW or muscle method
$align_seqs.py -i otus.fa -m clustalw -o rep_set_align

#14.Filter alignments on QIIME
$filter_alignment.py -i rep_set_align/otus_aligned.fasta -o rep_set_align

#15.Make the reference tree on QIIME
$make_phylogeny.py -i rep_set_align/otus_aligned_pfiltered.fasta -o rep_set.tre

#16.Run diversity analyses on QIIME
$core_diversity_analyses.py -i otus_table.biom -m mapping.txt -t rep_set.tre -e 10000 -o core_diversity -p qiime_params.txt
#The parameter “-e 10000” is the sequencing depth to use for even sub-sampling and maximum rarefaction depth. You should review the output of the 'biom summarize-table'  command (summary_biom.txt)to decide on this value.

#17.Making PCoA plots
$make_2d_plots.py -i core_diversity/bdiv_even10000/weighted_unifrac_pc.txt -o pcoa_plot -m mapping.txt -b Group
#-i参数后输入文件所在目录的数字（10000）会随前面命令的-e参数变化

#18.统计两组间的差异：
$compare_categories.py --method anosim -i core_diversity/bdiv_even10000/weighted_unifrac_dm.txt -m mapping.txt -c Group -o anosim_out -n 99