Skip to content

Commit

Permalink
Merge branch 'dev'
Browse files Browse the repository at this point in the history
* dev: (123 commits)
  update changelog and readme
  fix single-end inpipe error
  kill input process when sdbg_build meets errors
  update megahit script: delete in pipe files when builder report errors
  reset default k to k-step 20, look good enough
  update megahit script
  fix local assembly when read length > 200
  rename toolkits -> toolkit
  trim at N when reading reads
  add bz2 support (via mkfifo)
  add k-list support; try spades' list
  implement sequence package's reserve functions
  put assemble, local_assemble and iter into one program megahit_asm_core
  update megahit script logging
  fix min final standalone miscalculation
  add megahit toolkit assembler's min standalone length set to 3*k_max-1
  formatted by astyle
  bug fixes && makefile update
  re-add --input-cmd option
  output to final.contigs.fa when --no-local is checked (faster)
  ...

Conflicts:
	megahit
  • Loading branch information
voutcn committed Jun 18, 2015
2 parents 03cbdcb + 14b8ccc commit 83ad6ff
Show file tree
Hide file tree
Showing 111 changed files with 17,650 additions and 8,699 deletions.
5 changes: 1 addition & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,5 @@
# Sublime Text file
sftp-config.json
megahit_assemble
megahit_iter_k124
megahit_iter_k61
megahit_iter_k92
megahit_iter
sdbg_builder_cpu
test/*
12 changes: 12 additions & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
### 0.3.0-beta / 2015-06-18 HKT

New features:

* `--max-read-len` parameter no longer required
* `--memory` option set to 0.9 by default
* make use of PE information (with local assembly)
* `--prune-level` and `--merge-level` for setting pruning and merging intensity
* `--kmin-1pass` option for assembling ultra low-coverage datasets in less memory
* supporting bzip2 input files
* useful tools in `megahit_toolkit`, including contig2fastg for conversion of contig files into SPAdes-like fastg

### 0.2.1 / 2015-03-18
Bug Fixes:

Expand Down
126 changes: 84 additions & 42 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ COMMA = ,
ifdef sm
SM_ARCH = $(subst $(COMMA),-,$(sm))
else
SM_ARCH = 300
SM_ARCH = 350
endif

# Only one arch per tuning binary
Expand Down Expand Up @@ -145,62 +145,94 @@ DEPS = ./Makefile \
# g++ and its options
#-------------------------------------------------------------------------------
CUDALIBFLAG = -L/usr/local/cuda/lib64/ -lcuda -lcudart
CFLAGS = -O3 -Wall -funroll-loops -fprefetch-loop-arrays -fopenmp -std=c++0x -static-libgcc -lm
ZLIB = -lz
GCC_VER := $(shell echo `$(CXX) -dumpversion | cut -f1-2 -d.`)

CXXFLAGS = -O2 -Wall -Wno-unused-function -Wno-array-bounds -D__STDC_FORMAT_MACROS -funroll-loops -fprefetch-loop-arrays -fopenmp -I. -std=c++0x -static-libgcc
LIB = -lm -lz -lpthread

ifeq "4.5" "$(word 1, $(sort 4.5 $(GCC_VER)))"
CXXFLAGS += -static-libstdc++
endif

ifneq ($(disablempopcnt), 1)
CFLAGS += -mpopcnt
CXXFLAGS += -mpopcnt
endif
DEPS = Makefile
BIN_DIR = ./bin/

#-------------------------------------------------------------------------------
# CPU & GPU version
# standalone headers
#-------------------------------------------------------------------------------
STANDALONE_H = rank_and_select.h kmer_plus.h kmer.h lib_info.h \
bit_operation.h atomic_bit_vector.h functional.h \
khash.h kseq.h pool.h packed_reads.h sequence_package.h \
utils.h mem_file_checker-inl.h read_lib_functions-inl.h \
sdbg_builder_writers.h mac_pthread_barrier.h edge_reader.h \
histgram.h definitions.h lv2_cpu_sort.h

DEPS = Makefile $(STANDALONE_H)

#-------------------------------------------------------------------------------
# CPU & GPU version
#-------------------------------------------------------------------------------
ifeq ($(use_gpu), 1)
all: megahit_assemble megahit_iter_all sdbg_builder_gpu sdbg_builder_cpu
all: megahit_asm_core megahit_sdbg_build_gpu megahit_sdbg_build megahit_toolkit
chmod +x ./megahit
else
all: megahit_assemble megahit_iter_all sdbg_builder_cpu
all: megahit_asm_core megahit_sdbg_build megahit_toolkit
chmod +x ./megahit
endif

#-------------------------------------------------------------------------------
# IDBA library
#-------------------------------------------------------------------------------
LIB_IDBA_DIR = lib_idba
LIB_IDBA = $(LIB_IDBA_DIR)/contig_graph.o
LIB_IDBA += $(LIB_IDBA_DIR)/contig_graph_branch_group.o
LIB_IDBA += $(LIB_IDBA_DIR)/contig_info.o
LIB_IDBA += $(LIB_IDBA_DIR)/hash_graph.o
LIB_IDBA += $(LIB_IDBA_DIR)/sequence.o

#-------------------------------------------------------------------------------
# Toolkits
#-------------------------------------------------------------------------------
TOOLS_DIR = tools
TOOLKIT = $(TOOLS_DIR)/toolkit.cpp
TOOLKIT += $(TOOLS_DIR)/contigs_to_fastg.cpp
TOOLKIT += $(TOOLS_DIR)/read_stat.cpp
TOOLKIT += $(TOOLS_DIR)/trim_low_qual_tail.cpp
TOOLKIT += $(TOOLS_DIR)/filter_by_len.cpp

#-------------------------------------------------------------------------------
# CPU objectives
#-------------------------------------------------------------------------------
%.o: %.cpp %.h $(DEPS)
$(CXX) $(CFLAGS) -c $< -o $@
$(CXX) $(CXXFLAGS) -c $< -o $@
%.o: %.cpp $(DEPS)
$(CXX) $(CXXFLAGS) -c $< -o $@

.cx1_functions_cpu.o: cx1_functions.cpp $(DEPS)
$(CXX) $(CFLAGS) -D DISABLE_GPU -c cx1_functions.cpp -o .cx1_functions_cpu.o
#-------------------------------------------------------------------------------
# asm_core objectives
#-------------------------------------------------------------------------------
LIB_ASM = succinct_dbg.o assembly_algorithms.o branch_group.o options_description.o \
unitig_graph.o sequence_manager.o local_assembler.o city.o

#-------------------------------------------------------------------------------
# CPU Applications
#-------------------------------------------------------------------------------
sdbg_builder_cpu: sdbg_builder.cpp .cx1_functions_cpu.o lv2_cpu_sort.h options_description.o $(DEPS)
$(CXX) $(CFLAGS) -D DISABLE_GPU sdbg_builder.cpp .cx1_functions_cpu.o options_description.o $(ZLIB) -o sdbg_builder_cpu

megahit_assemble: assembler.cpp succinct_dbg.o rank_and_select.o assembly_algorithms.o branch_group.o options_description.o unitig_graph.o $(DEPS)
$(CXX) $(CFLAGS) assembler.cpp rank_and_select.o succinct_dbg.o assembly_algorithms.o branch_group.o options_description.o unitig_graph.o $(ZLIB) -o megahit_assemble

megahit_iter_all: megahit_iter_k61 megahit_iter_k92 megahit_iter_k124
megahit_sdbg_build: sdbg_builder.cpp cx1.h lv2_cpu_sort.h cx1_kmer_count.o cx1_read2sdbg_s1.o cx1_read2sdbg_s2.o cx1_seq2sdbg.o options_description.o sequence_manager.o $(DEPS)
$(CXX) $(CXXFLAGS) sdbg_builder.cpp cx1_kmer_count.o options_description.o cx1_read2sdbg_s1.o cx1_read2sdbg_s2.o cx1_seq2sdbg.o sequence_manager.o $(LIB) -o megahit_sdbg_build

megahit_iter_k61: iterate_edges.cpp iterate_edges.h options_description.o $(DEPS)
$(CXX) $(CFLAGS) -D KMER_NUM_UINT64=2 iterate_edges.cpp options_description.o $(ZLIB) -o megahit_iter_k61
megahit_asm_core: $(LIB_ASM) $(LIB_IDBA) asm_core.cpp assembler.cpp local_assemble.cpp iterate_edges.cpp $(DEPS)
$(CXX) $(CXXFLAGS) asm_core.cpp assembler.cpp local_assemble.cpp iterate_edges.cpp $(LIB_IDBA) $(LIB_ASM) $(LIB) -o megahit_asm_core

megahit_iter_k92: iterate_edges.cpp iterate_edges.h options_description.o $(DEPS)
$(CXX) $(CFLAGS) -D KMER_NUM_UINT64=3 iterate_edges.cpp options_description.o $(ZLIB) -o megahit_iter_k92

megahit_iter_k124: iterate_edges.cpp iterate_edges.h options_description.o $(DEPS)
$(CXX) $(CFLAGS) -D KMER_NUM_UINT64=4 iterate_edges.cpp options_description.o $(ZLIB) -o megahit_iter_k124
megahit_toolkit: $(TOOLKIT) $(DEPS)
$(CXX) $(CXXFLAGS) $(TOOLKIT) $(LIB) -o megahit_toolkit

#-------------------------------------------------------------------------------
# Applications for debug usage
#-------------------------------------------------------------------------------
query_sdbg: query_sdbg.cpp succinct_dbg.o rank_and_select.o assembly_algorithms.o branch_group.o unitig_graph.o $(DEPS)
$(CXX) $(CFLAGS) query_sdbg.cpp rank_and_select.o succinct_dbg.o assembly_algorithms.o branch_group.o unitig_graph.o -o query_sdbg
query_sdbg: query_sdbg.cpp succinct_dbg.o rank_and_select.h assembly_algorithms.o branch_group.o unitig_graph.o $(DEPS)
$(CXX) $(CXXFLAGS) query_sdbg.cpp succinct_dbg.o assembly_algorithms.o branch_group.o unitig_graph.o -o query_sdbg

ifeq ($(use_gpu), 1)
#-------------------------------------------------------------------------------
# GPU objectives
#-------------------------------------------------------------------------------
Expand All @@ -209,33 +241,43 @@ ifeq ($(use_gpu), 1)
$(NVCC) $(DEFINES) $(SM_TARGETS) lv2_gpu_functions.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3 -DTUNE_ARCH=$(SM_ARCH) -DTUNE_SIZE=$(TUNE_SIZE) -o .lv2_gpu_functions_$(SUFFIX).cpp

# cpp -> o
.lv2_gpu_functions_$(SUFFIX).o: .lv2_gpu_functions_$(SUFFIX).cpp $(DEPS)
$(CXX) $(CFLAGS) -c .lv2_gpu_functions_$(SUFFIX).cpp -o .lv2_gpu_functions_$(SUFFIX).o
lv2_gpu_functions_$(SUFFIX).o: .lv2_gpu_functions_$(SUFFIX).cpp $(DEPS)
$(CXX) $(CXXFLAGS) -c .lv2_gpu_functions_$(SUFFIX).cpp -o lv2_gpu_functions_$(SUFFIX).o

cx1_kmer_count_gpu.o: cx1_kmer_count.cpp $(DEPS)
$(CXX) $(CXXFLAGS) -D USE_GPU -c cx1_kmer_count.cpp -o cx1_kmer_count_gpu.o

.cx1_functions.o: cx1_functions.cpp $(DEPS)
$(CXX) $(CFLAGS) -c cx1_functions.cpp -o .cx1_functions.o
cx1_read2sdbg_s1_gpu.o: cx1_read2sdbg_s1.cpp $(DEPS)
$(CXX) $(CXXFLAGS) -D USE_GPU -c cx1_read2sdbg_s1.cpp -o cx1_read2sdbg_s1_gpu.o

cx1_read2sdbg_s2_gpu.o: cx1_read2sdbg_s2.cpp $(DEPS)
$(CXX) $(CXXFLAGS) -D USE_GPU -c cx1_read2sdbg_s2.cpp -o cx1_read2sdbg_s2_gpu.o

cx1_seq2sdbg_gpu.o: cx1_seq2sdbg.cpp $(DEPS)
$(CXX) $(CXXFLAGS) -D USE_GPU -c cx1_seq2sdbg.cpp -o cx1_seq2sdbg_gpu.o

#-------------------------------------------------------------------------------
# GPU Applications
#-------------------------------------------------------------------------------
sdbg_builder_gpu: sdbg_builder.cpp .cx1_functions.o .lv2_gpu_functions_$(SUFFIX).o options_description.o $(DEPS)
$(CXX) $(CFLAGS) $(CUDALIBFLAG) sdbg_builder.cpp .lv2_gpu_functions_$(SUFFIX).o .cx1_functions.o options_description.o $(ZLIB) -o sdbg_builder_gpu
endif
megahit_sdbg_build_gpu: sdbg_builder.cpp cx1_kmer_count_gpu.o cx1_read2sdbg_s1_gpu.o cx1_read2sdbg_s2_gpu.o cx1_seq2sdbg_gpu.o lv2_gpu_functions_$(SUFFIX).o options_description.o sequence_manager.o $(DEPS)
$(CXX) $(CXXFLAGS) $(CUDALIBFLAG) -D USE_GPU sdbg_builder.cpp lv2_gpu_functions_$(SUFFIX).o cx1_kmer_count_gpu.o cx1_read2sdbg_s1_gpu.o cx1_read2sdbg_s2_gpu.o cx1_seq2sdbg_gpu.o options_description.o sequence_manager.o $(LIB) -o megahit_sdbg_build_gpu

#-------------------------------------------------------------------------------
# Build binary directory
#-------------------------------------------------------------------------------

.PHONY:
test: megahit_assemble megahit_iter_all sdbg_builder_cpu
test: megahit_asm_core megahit_sdbg_build megahit_toolkit
-rm -fr example/megahit_out
./megahit -m 0.9 -l 100 -r example/readsInterleaved.fa -o example/megahit_out
./megahit --12 example/readsInterleaved1.fa.gz,example/readsInterleaved2.fa.bz2,example/readsInterleaved3.fa -o example/megahit_out -t 4

test_gpu: megahit_assemble megahit_iter_all sdbg_builder_gpu
test_gpu: megahit_asm_core megahit_sdbg_build_gpu megahit_toolkit
-rm -fr example/megahit_gpu_out
./megahit -m 0.9 -l 100 -r example/readsInterleaved.fa --use-gpu -o example/megahit_gpu_out
./megahit --12 example/readsInterleaved1.fa.gz,example/readsInterleaved2.fa.bz2,example/readsInterleaved3.fa --use-gpu -o example/megahit_gpu_out -t 4

.PHONY:
clean:
-rm -fr *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o .*.cpp \
example/megahit_*out
-rm -fr *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o .*.o .*.cpp \
$(LIB_IDBA) \
example/megahit_*out \
megahit_asm_core megahit_sdbg_build megahit_sdbg_build_gpu megahit_toolkit
54 changes: 24 additions & 30 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
MEGAHIT
=========

MEGAHIT is a single node assembler for large and complex metagenomics NGS reads, such as soil. It makes use of succinct *de Bruijn* graph (SdBG) to achieve low memory assembly. The graph construction algorithm can self-adjust to use all available or moderate memory, and can be accelerated if a CUDA-enable GPU is provided. The GPU-accelerated version of MEGAHIT has been tested on NVIDIA GTX680 (4G memory) and Tesla K40c (12G memory).
MEGAHIT is a single-node assembler for large and complex metagenomics NGS reads, such as soil. It makes use of a succinct *de Bruijn* graph (SdBG) to achieve low-memory assembly. The graph construction algorithm can self-adjust to use all available or moderate memory, and can be accelerated if a CUDA-enabled GPU is provided. The GPU-accelerated version of MEGAHIT has been tested on NVIDIA GTX680 (4G memory) and Tesla K40c (12G memory) with CUDA 5.5, 6.0 and 6.5.

[![Build Status](https://travis-ci.org/voutcn/megahit.svg)](https://travis-ci.org/voutcn/megahit)
[![Build Status](https://drone.io/github.com/voutcn/megahit/status.png)](https://drone.io/github.com/voutcn/megahit/latest)
Expand All @@ -28,43 +28,36 @@ Please refer to the [release page](https://github.com/voutcn/megahit/releases).
If MEGAHIT is successfully compiled, it can be run by the following command:

```
./megahit [options] -m <max_memory_to_use> -l <max_read_len> {-r <reads.fa> | --input_cmd <command>}
./megahit [options] {-r <se.fq> | -1 <pe1.fq> -2 <pe2.fq> | --12 <pe12.fq> }
```

or type `make test` in MEGAHIT's source directory for a quick test and `./megahit -h` for the usage message.
or type `make test` in MEGAHIT's source directory for a quick test and `./megahit -h` for the usage message.

If you need to install MEGAHIT in another directory after compilation, please copy `megahit`, `megahit_asm_core` and `megahit_sdbg_build` (and `megahit_sdbg_build_gpu`) to the destination.

### Using GPU Version
To use the GPU version, run `make use_gpu=1` to compile MEGAHIT, and run MEGAHIT with `--use-gpu`. GPU version has only been tested in Linux.


Memory Setting
----------------
Users are required to set a memory parameter `-m` for MEGAHIT. This parameter specifies the maximum memory that can be used by the SdBG construction component of MEGAHIT. If 0 < `-m` < 1, the parameter specifies the fraction of the machine's total memory to be used; if `-m` >= 1, it specifies the memory in bytes to be used. `--mem-flag` is another option for memory control.

### Quick recommendation
Set the `-m` parameter to be 90-95% of the available memory (e.g. `-m 0.9`) and leave the `--mem-flag` default.

### Detail explanation
Please refer to [this wiki page](https://github.com/voutcn/megahit/wiki/MEGAHIT-Memory-setting).


Input Files
--------------

MEGAHIT accepts one fasta or fastq file as input. The input file can be gzip'ed. Alternatively, you can use the option `--input-cmd` to input reads from multiple files. Following the `--input-cmd` should be a command that outputs all reads to `STDOUT` in fasta or fastq format. A mix of fasta and fastq is also supported from version 0.2.0. Currently pair-end information is not used by MEGAHIT. Therefore pair-end files can be input to MEGAHIT as multiple single-end files. Some examples are shown on [this wiki page](https://github.com/voutcn/megahit/wiki/Input-examples).

Options
Assembly Tips
------------------------
###Choosing *k*
MEGAHIT uses multiple *k*-mer strategy. Minimum *k*, maximum *k* and the step for iteration can be set by options `--k-min`, `--k-max` and `--k-step` respectively. *k* must be odd numbers while the step must be an even number.

For ultra complex metagenomics data such as soil, a larger *k<sub>min</sub>*, say 27, is recommended to reduce the complexity of the *de Bruijn* graph. Quality trimming is also recommended.
MEGAHIT uses a multiple *k*-mer strategy. The minimum *k*, the maximum *k* and the step for iteration can be set by the options `--k-min`, `--k-max` and `--k-step` respectively. Each *k* must be an odd number, while the step must be an even number.
* For ultra complex metagenomics data such as soil, a larger *k<sub>min</sub>*, say 27, is recommended to reduce the complexity of the *de Bruijn* graph. Quality trimming is also recommended
* For high-depth generic data, large `--k-min` (25 to 31) is recommended
* Smaller `--k-step`, say 10, is more friendly to low-coverage datasets

###Filtering (*k<sub>min</sub>*+1)-mer
(*k<sub>min</sub>*+1)-mer with multiplicity lower than *d* (default 2, specified by `--min-count` option) will be discarded. You should be cautious to set *d* less than 2, which will lead to a much larger and noisy graph. We recommend using the default value 2 for metagenomics assembly. If you want to use MEGAHIT to do generic assembly, please change this value according to the sequencing depth.
(*k<sub>min</sub>*+1)-mers with multiplicity lower than *d* (default 2, specified by the `--min-count` option) will be discarded. Be cautious about setting *d* to less than 2, which will lead to a much larger and noisier graph. We recommend using the default value 2 for metagenomics assembly. If you want to use MEGAHIT for generic assembly, please change this value according to the sequencing depth (we recommend `--min-count 3` for >40x).

###Mercy *k*-mer
This is specially designed for metagenomics assembly to recover low coverage sequence. You can disable it with `--no-mercy` option.
This is specially designed for metagenomics assembly to recover low-coverage sequences. For generic datasets with depth >= 30x, MEGAHIT may generate better results with the `--no-mercy` option.

###*k*-min 1pass mode
This mode can be activated by the option `--kmin-1pass`. It is more memory-efficient for ultra low-depth datasets, such as soil metagenomics data.

FAQ
-----------------------
Please refer to [our wiki](https://github.com/voutcn/megahit/wiki).

Reporting Issues
-----------------------
Expand All @@ -76,8 +69,9 @@ Citing MEGAHIT

License
-----------------------
MEGAHIT is released under GPLv3. Several third-party libs are used, including:
MEGAHIT is released under GPLv3. Several third-party codes are used, including:

* [CUB](https://github.com/NVlabs/cub) under "New BSD"" license
* kseq.h in [klib](https://github.com/attractivechaos/klib) under MIT license
* hash_{table, set, map}.h in [IDBA package](http://i.cs.hku.hk/~alse/hkubrg/projects/idba/) under GPLv2
* [CUB](https://github.com/NVlabs/cub) under "New BSD" license
* [klib](https://github.com/attractivechaos/klib) under MIT license
* [IDBA package](http://i.cs.hku.hk/~alse/hkubrg/projects/idba/) under GPLv2
* [CityHash](https://code.google.com/p/cityhash/) under MIT license
60 changes: 60 additions & 0 deletions asm_core.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
* MEGAHIT
* Copyright (C) 2014 - 2015 The University of Hong Kong
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

/* contact: Dinghua Li <[email protected]> */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "utils.h"

// Entry points of the sub-programs dispatched by main() below. They are only
// declared here; their definitions live outside this file and are resolved at
// link time. Each one takes an (argc, argv) pair in the usual main() style.

// "assemble" sub-program.
int main_assemble(int argc, char **argv);

// "local" sub-program.
int main_local(int argc, char **argv);

// "iterate" sub-program.
int main_iterate(int argc, char **argv);

/**
 * Print the usage message of the sub-program dispatcher to stderr.
 *
 * The message lists the three sub-programs accepted as the first command-line
 * argument: assemble, local and iterate.
 *
 * @param program_name  text substituted into the "Usage:" line; main() passes
 *                      argv[0]
 */
void show_help(const char *program_name) {
    fprintf(stderr, "Usage: %s <sub_program> [sub options]\n"
            " sub-programs:\n"
            " assemble assemble from SdBG\n"
            " local local assembly\n"
            " iterate extract iterative edges\n",
            program_name);
}

// Process-wide AutoMaxRssRecorder instance. As a namespace-scope object it is
// constructed before main() starts and destroyed on normal program
// termination (return from main() or exit()).
// NOTE(review): the type is not defined in this file (presumably it comes from
// utils.h) and, judging by its name, reports the peak resident set size of the
// run -- confirm against its definition.
AutoMaxRssRecorder recorder;

/**
 * Entry point: dispatch to the sub-program named by the first argument.
 *
 * Recognized names are "assemble", "local" and "iterate". The selected
 * sub-program receives the argument vector shifted by one, so its own name
 * becomes its argv[0], and its return value becomes the return value of
 * main().
 *
 * If no sub-program is given, or the name is not recognized, the usage
 * message is printed through show_help() and the process exits with status 1.
 */
int main(int argc, char **argv) {
    if (argc >= 2) {
        const char *sub_program = argv[1];
        const int sub_argc = argc - 1;
        char **sub_argv = argv + 1;

        if (!strcmp(sub_program, "assemble")) {
            return main_assemble(sub_argc, sub_argv);
        }
        if (!strcmp(sub_program, "local")) {
            return main_local(sub_argc, sub_argv);
        }
        if (!strcmp(sub_program, "iterate")) {
            return main_iterate(sub_argc, sub_argv);
        }
    }

    // Missing or unrecognized sub-program: show usage and fail.
    show_help(argv[0]);
    exit(1);
}
Loading

0 comments on commit 83ad6ff

Please sign in to comment.