bam.h |
BAM library provides I/O and various operations on manipulating files
in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map)
format. It now supports importing from or exporting to TAM, sorting,
merging, generating pileup, and quickly retrieval of reads overlapped
with a specified region.
sam_open |
Open a SAM file for reading, either uncompressed or compressed by gzip/zlib.
tamFile sam_open( const char *fn);
fn
SAM file handler
sam_close |
Close a SAM file handler
void sam_close( tamFile fp);
fp
sam_read1 |
Read one alignment from a SAM file handler
int sam_read1( tamFile fp, bam_header_t *header, bam1_t *b);
fp
header
b
0 if successful; otherwise negative
sam_header_read2 |
Read header information from a TAB-delimited list file.
bam_header_t *sam_header_read2( const char *fn_list);
fn_list
a pointer to the header structure
Each line in this file consists of chromosome name and the length of chromosome.
sam_header_read |
Read header from a SAM file (if present)
bam_header_t *sam_header_read( tamFile fp);
fp
pointer to header struct; 0 if no @SQ lines available
sam_header_parse |
Parse @SQ lines a update a header struct
int sam_header_parse( bam_header_t *h);
h
number of target sequences
bam_header_t::{n_targets,target_len,target_name} will be destroyed in the first place.
sam_header_parse_rg |
Parse @RG lines a update a header struct
int sam_header_parse_rg( bam_header_t *h);
h
number of @RG lines
bam_header_t::rg2lib will be destroyed in the first place.
bam_header_init |
Initialize a header structure.
bam_header_t *bam_header_init();
the pointer to the header structure
This function also modifies the global variable bam_is_be.
bam_header_destroy |
Destroy a header structure.
void bam_header_destroy( bam_header_t *header);
header
bam_header_read |
Read a header structure from BAM.
bam_header_t *bam_header_read( bamFile fp);
fp
pointer to the header structure
The file position indicator must be placed at the beginning of the file. Upon success, the position indicator will be set at the start of the first alignment.
bam_header_write |
Write a header structure to BAM.
int bam_header_write( bamFile fp, const bam_header_t *header);
fp
header
always 0 currently
bam_read1 |
Read an alignment from BAM.
int bam_read1( bamFile fp, bam1_t *b);
fp
b
number of bytes read from the file
The file position indicator must be placed right before an alignment. Upon success, this function will set the position indicator to the start of the next alignment. This function is not affected by the machine endianness.
bam_write1_core |
Write an alignment to BAM.
int bam_write1_core( bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data);
fp
c
data_len
data
number of bytes written to the file
This function is not affected by the machine endianness.
bam_write1 |
It is equivalent to: bam_write1_core(fp, &b->core, b->data_len, b->data)
int bam_write1( bamFile fp, const bam1_t *b);
fp
b
number of bytes written to the file
bam_format1 |
Format a BAM record in the SAM format
char *bam_format1( const bam_header_t *header, const bam1_t *b);
header
b
a pointer to the SAM string
bam_plbuf_reset |
Reset a pileup buffer for another pileup process
void bam_plbuf_reset( bam_plbuf_t *buf);
buf
bam_plbuf_init |
Initialize a buffer for pileup.
bam_plbuf_t *bam_plbuf_init( bam_pileup_f func, void *data);
func
data
pointer to the pileup buffer
bam_plbuf_destroy |
Destroy a pileup buffer.
void bam_plbuf_destroy( bam_plbuf_t *buf);
buf
bam_plbuf_push |
Push an alignment to the pileup buffer.
See:
- bam_plbuf_init()" -->
int bam_plbuf_push( const bam1_t *b, bam_plbuf_t *buf);
b
buf
always 0 currently
If all the alignments covering a particular site have
been collected, this function will call the user defined function
as is provided to bam_plbuf_init(). The coordinate of the site and
all the alignments will be transferred to the user defined
function as function parameters.
When all the alignments are pushed to the buffer, this function
needs to be called with b equal to NULL. This will flush the
buffer. A pileup buffer can only be reused when bam_plbuf_reset()
is called.
bam_lplbuf_init |
bam_plbuf_init() equivalent with level calculated.
bam_lplbuf_t *bam_lplbuf_init( bam_pileup_f func, void *data);
bam_lplbuf_destroy |
bam_plbuf_destroy() equivalent with level calculated.
void bam_lplbuf_destroy( bam_lplbuf_t *tv);
bam_lplbuf_push |
bam_plbuf_push() equivalent with level calculated.
int bam_lplbuf_push( const bam1_t *b, bam_lplbuf_t *buf);
bam_lpileup_file |
bam_plbuf_file() equivalent with level calculated.
int bam_lpileup_file( bamFile fp, int mask, bam_pileup_f func, void *func_data);
bam_index_build |
Build index for a BAM file.
int bam_index_build( const char *fn);
fn
always 0 currently
Index file "fn.bai" will be created.
bam_index_load |
Load index from file "fn.bai".
bam_index_t *bam_index_load( const char *fn);
fn
pointer to the index structure
bam_index_destroy |
Destroy an index structure.
void bam_index_destroy( bam_index_t *idx);
idx
bam_fetch |
Retrieve the alignments that are overlapped with the specified region.
int bam_fetch( bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func);
fp
idx
tid
beg
end
data
func
A user defined function will be called for each
retrieved alignment ordered by its start position.
bam_parse_region |
Parse a region in the format: "chr2:100,000-200,000".
void bam_parse_region( bam_header_t *header, const char *str, int *ref_id, int *begin, int *end);
header
str
ref_id
begin
end
bam_header_t::hash will be initialized if empty.
bam_aux_get |
Retrieve data of a tag
uint8_t *bam_aux_get( const bam1_t *b, const char tag[2]);
b
tag
pointer to the type and data. The first character is the
type that can be 'iIsScCdfAZH'.
Use bam_aux2?() series to convert the returned data to the corresponding type.
bam_calend |
Calculate the rightmost coordinate of an alignment on the reference genome.
uint32_t bam_calend( const bam1_core_t *c, const uint32_t *cigar);
c
cigar
the rightmost coordinate, 0-based
bam_cigar2qlen |
Calculate the length of the query sequence from CIGAR.
int32_t bam_cigar2qlen( const bam1_core_t *c, const uint32_t *cigar);
c
cigar
length of the query sequence
bam_reg2bin |
Calculate the minimum bin that contains a region [beg,end).
static inline int bam_reg2bin( uint32_t beg, uint32_t end)
beg
end
bin
bam_copy1 |
Copy an alignment
static inline bam1_t *bam_copy1( bam1_t *bdst, const bam1_t *bsrc)
bdst
bsrc
pointer to the destination alignment struct
bam_dup1 |
Duplicate an alignment
static inline bam1_t *bam_dup1( const bam1_t *src)
src
pointer to the destination alignment struct
bamFile |
BAM file handler
typedef RAZF *bamFile;
bamFile |
BAM file handler
typedef BGZF *bamFile;
bamFile |
BAM file handler
typedef RAZF *bamFile;
bam_header_t |
Structure for the alignment header.
typedef struct { int32_t n_targets; char **target_name; uint32_t *target_len; void *hash, *rg2lib; int l_text; char *text; } bam_header_t;
n_targets
- number of reference sequences
target_name
- names of the reference sequences
target_len
- lengths of the referene sequences
hash
- hash table for fast name lookup
rg2lib
- hash table for @RG-ID -> LB lookup
l_text
- length of the plain text in the header
text
- plain text
Field hash points to null by default. It is a private member.
bam1_core_t |
Structure for core alignment information.
typedef struct { int32_t tid; int32_t pos; uint32_t bin:16, qual:8, l_qname:8; uint32_t flag:16, n_cigar:16; int32_t l_qseq; int32_t mtid; int32_t mpos; int32_t isize; } bam1_core_t;
tid
- chromosome ID, defined by bam_header_t
pos
- 0-based leftmost coordinate
strand
- strand; 0 for forward and 1 otherwise
bin
- bin calculated by bam_reg2bin()
qual
- mapping quality
l_qname
- length of the query name
flag
- bitwise flag
n_cigar
- number of CIGAR operations
l_qseq
- length of the query sequence (read)
bam1_t |
Structure for one alignment.
typedef struct { bam1_core_t core; int l_aux, data_len, m_data; uint8_t *data; } bam1_t;
core
- core information about the alignment
l_aux
- length of auxiliary data
data_len
- current length of bam1_t::data
m_data
- maximum length of bam1_t::data
data
- all variable-length data, concatenated; structure: cigar-qname-seq-qual-aux
Notes:
tamFile |
TAM file handler
typedef struct __tamFile_t *tamFile;
bam_pileup1_t |
Structure for one alignment covering the pileup position.
typedef struct { bam1_t *b; int32_t qpos; int indel, level; uint32_t is_del:1, is_head:1, is_tail:1; } bam_pileup1_t;
b
- pointer to the alignment
qpos
- position of the read base at the pileup site, 0-based
indel
- indel length; 0 for no indel, positive for ins and negative for del
is_del
- 1 iff the base on the padded read is a deletion
level
- the level of the read in the "viewer" mode
See also bam_plbuf_push() and bam_lplbuf_push(). The difference between the two functions is that the former does not set bam_pileup1_t::level, while the later does. Level helps the implementation of alignment viewers, but calculating this has some overhead.
bam_plbuf_t |
pileup buffer
typedef struct __bam_plbuf_t bam_plbuf_t;
bam_pileup_f |
Type of function to be called by bam_plbuf_push().
typedef int ( *bam_pileup_f)( uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
tid
- chromosome ID as is defined in the header
pos
- start coordinate of the alignment, 0-based
n
- number of elements in pl array
pl
- array of alignments
data
- user provided data
See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t.
bam_fetch_f |
Type of function to be called by bam_fetch().
typedef int ( *bam_fetch_f)( const bam1_t *b, void *data);
b
- the alignment
data
- user provided data
bam_is_be |
Whether the machine is big-endian; modified only in bam_header_init().
extern int bam_is_be;
bam_nt16_table |
Table for converting a nucleotide character to the 4-bit encoding.
extern unsigned char bam_nt16_table[256];
bam_nt16_rev_table |
Table for converting a 4-bit encoded nucleotide to a letter.
extern char *bam_nt16_rev_table;
BAM_FPAIRED |
the read is paired in sequencing, no matter whether it is mapped in a pair
#define BAM_FPAIRED 1
BAM_FPROPER_PAIR |
the read is mapped in a proper pair
#define BAM_FPROPER_PAIR 2
BAM_FUNMAP |
the read itself is unmapped; conflictive with BAM_FPROPER_PAIR
#define BAM_FUNMAP 4
BAM_FMUNMAP |
the mate is unmapped
#define BAM_FMUNMAP 8
BAM_FREVERSE |
the read is mapped to the reverse strand
#define BAM_FREVERSE 16
BAM_FMREVERSE |
the mate is mapped to the reverse strand
#define BAM_FMREVERSE 32
BAM_FREAD1 |
this is read1
#define BAM_FREAD1 64
BAM_FREAD2 |
this is read2
#define BAM_FREAD2 128
BAM_FSECONDARY |
not primary alignment
#define BAM_FSECONDARY 256
BAM_FQCFAIL |
QC failure
#define BAM_FQCFAIL 512
BAM_FDUP |
optical or PCR duplicate
#define BAM_FDUP 1024
BAM_DEF_MASK |
defautl mask for pileup
#define BAM_DEF_MASK
BAM_CMATCH |
CIGAR: match
#define BAM_CMATCH 0
BAM_CINS |
CIGAR: insertion to the reference
#define BAM_CINS 1
BAM_CDEL |
CIGAR: deletion from the reference
#define BAM_CDEL 2
BAM_CREF_SKIP |
CIGAR: skip on the reference (e.g. spliced alignment)
#define BAM_CREF_SKIP 3
BAM_CSOFT_CLIP |
CIGAR: clip on the read with clipped sequence present in qseq
#define BAM_CSOFT_CLIP 4
BAM_CHARD_CLIP |
CIGAR: clip on the read with clipped sequence trimmed off
#define BAM_CHARD_CLIP 5
BAM_CPAD |
CIGAR: padding
#define BAM_CPAD 6
bam1_cigar |
Get the CIGAR array
#define bam1_cigar(b)
b
pointer to the CIGAR array
In the CIGAR array, each element is a 32-bit integer. The lower 4 bits gives a CIGAR operation and the higher 28 bits keep the length of a CIGAR.
bam1_qname |
Get the name of the query
#define bam1_qname(b)
b
pointer to the name string, null terminated
bam1_seq |
Get query sequence
#define bam1_seq(b)
b
pointer to sequence
Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, 8 for T and 15 for N. Two bases are packed in one byte with the base at the higher 4 bits having smaller coordinate on the read. It is recommended to use bam1_seqi() macro to get the base.
bam1_qual |
Get query quality
#define bam1_qual(b)
b
pointer to quality string
bam1_seqi |
Get a base on read
#define bam1_seqi(s, i)
s
i
4-bit integer representing the base.
bam1_aux |
Get query sequence and quality
#define bam1_aux(b)
b
pointer to the concatenated auxiliary data
kroundup32 |
Round an integer to the next closest power-2 integer.
#define kroundup32(x)
x
x will be modified.
bam_init1 |
Initiate a pointer to bam1_t struct
#define bam_init1()
bam_destroy1 |
Free the memory allocated for an alignment.
#define bam_destroy1(b) do { \ free((b)->data); free(b); \ } while (0)
b
© Genome Research Ltd. Last Updated: Saturday, June 13, 2009