25#ifdef __ZLIB_AVAILABLE__
37#define _bgzf_open(fn, mode) knet_open(fn, mode)
38#define _bgzf_dopen(fp, mode) knet_dopen(fp, mode)
39#define _bgzf_close(fp) knet_close(fp)
40#define _bgzf_fileno(fp) ((fp)->fd)
41#define _bgzf_tell(fp) knet_tell(fp)
42#define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence)
43#define _bgzf_read(fp, buf, len) knet_read(fp, buf, len)
44#define _bgzf_write(fp, buf, len) knet_write(fp, buf, len)
46#if defined(_WIN32) || defined(_MSC_VER)
47#define ftello(fp) ftell(fp)
48#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
50extern off_t ftello(FILE *stream);
51extern int fseeko(FILE *stream, off_t offset,
int whence);
53typedef FILE *_bgzf_file_t;
54#define _bgzf_open(fn, mode) fopen(fn, mode)
55#define _bgzf_dopen(fp, mode) fdopen(fp, mode)
56#define _bgzf_close(fp) fclose(fp)
57#define _bgzf_fileno(fp) fileno(fp)
58#define _bgzf_tell(fp) ftello(fp)
59#define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, whence)
60#define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp)
61#define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp)
64#define BLOCK_HEADER_LENGTH 18
65#define BLOCK_FOOTER_LENGTH 8
72static const uint8_t g_magic[19] =
"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0";
81KHASH_MAP_INIT_INT64(cache, cache_t)
84static inline void packInt16(uint8_t *buffer, uint16_t value)
87 buffer[1] = value >> 8;
90static inline int unpackInt16(
const uint8_t *buffer)
92 return buffer[0] | buffer[1] << 8;
95static inline void packInt32(uint8_t *buffer, uint32_t value)
98 buffer[1] = value >> 8;
99 buffer[2] = value >> 16;
100 buffer[3] = value >> 24;
103static BGZF *bgzf_read_init()
106 fp = calloc(1,
sizeof(
BGZF));
108 fp->uncompressed_block = malloc(BGZF_BLOCK_SIZE);
109 fp->compressed_block = malloc(BGZF_BLOCK_SIZE);
111 fp->cache = kh_init(cache);
116static BGZF *bgzf_write_init(
int compress_level)
119 fp = calloc(1,
sizeof(
BGZF));
121 fp->uncompressed_block = malloc(BGZF_BLOCK_SIZE);
122 fp->compressed_block = malloc(BGZF_BLOCK_SIZE);
123 fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level;
124 if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
128static int mode2level(
const char *__restrict mode)
130 int i, compress_level = -1;
131 for (i = 0; mode[i]; ++i)
132 if (mode[i] >=
'0' && mode[i] <=
'9')
break;
133 if (mode[i]) compress_level = (int)mode[i] -
'0';
134 if (strchr(mode,
'u')) compress_level = 0;
135 return compress_level;
138BGZF *bgzf_open(
const char *path,
const char *mode)
141 if (strchr(mode,
'r') || strchr(mode,
'R')) {
143 if ((fpr = _bgzf_open(path,
"r")) == 0)
return 0;
144 fp = bgzf_read_init();
146 }
else if (strchr(mode,
'w') || strchr(mode,
'W')) {
148 if ((fpw = fopen(path,
"w")) == 0)
return 0;
149 fp = bgzf_write_init(mode2level(mode));
151 }
else if (strchr(mode,
'a') || strchr(mode,
'A')) {
153 if ((fpw = fopen(path,
"r+")) == 0)
return 0;
154 fp = bgzf_write_init(mode2level(mode));
157 if(bgzf_check_EOF(fp))
160 _bgzf_seek(fp->fp, -28, SEEK_END);
165 _bgzf_seek(fp->fp, 0, SEEK_END);
171BGZF *bgzf_dopen(
int fd,
const char *mode)
174 if (strchr(mode,
'r') || strchr(mode,
'R')) {
176 if ((fpr = _bgzf_dopen(fd,
"r")) == 0)
return 0;
177 fp = bgzf_read_init();
179 }
else if (strchr(mode,
'w') || strchr(mode,
'W')) {
181 if ((fpw = fdopen(fd,
"w")) == 0)
return 0;
182 fp = bgzf_write_init(mode2level(mode));
189static int deflate_block(
BGZF *fp,
int block_length)
191 uint8_t *buffer = fp->compressed_block;
192 int buffer_size = BGZF_BLOCK_SIZE;
193 int input_length = block_length;
194 int compressed_length = 0;
198 assert(block_length <= BGZF_BLOCK_SIZE);
199 memcpy(buffer, g_magic, BLOCK_HEADER_LENGTH);
205 zs.next_in = fp->uncompressed_block;
206 zs.avail_in = input_length;
207 zs.next_out = (
void*)&buffer[BLOCK_HEADER_LENGTH];
208 zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
209 status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY);
210 if (status != Z_OK) {
211 fp->errcode |= BGZF_ERR_ZLIB;
214 status = deflate(&zs, Z_FINISH);
215 if (status != Z_STREAM_END) {
217 if (status == Z_OK) {
218 input_length -= 1024;
219 assert(input_length > 0);
222 fp->errcode |= BGZF_ERR_ZLIB;
225 if (deflateEnd(&zs) != Z_OK) {
226 fp->errcode |= BGZF_ERR_ZLIB;
229 compressed_length = zs.total_out;
230 compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
231 assert(compressed_length <= BGZF_BLOCK_SIZE);
235 assert(compressed_length > 0);
236 packInt16((uint8_t*)&buffer[16], compressed_length - 1);
237 crc = crc32(0L, NULL, 0L);
238 crc = crc32(crc, fp->uncompressed_block, input_length);
239 packInt32((uint8_t*)&buffer[compressed_length-8], crc);
240 packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
242 remaining = block_length - input_length;
244 assert(remaining <= input_length);
245 memcpy(fp->uncompressed_block, fp->uncompressed_block + input_length, remaining);
247 fp->block_offset = remaining;
248 return compressed_length;
252static int inflate_block(
BGZF* fp,
int block_length)
257 zs.next_in = fp->compressed_block + 18;
258 zs.avail_in = block_length - 16;
259 zs.next_out = fp->uncompressed_block;
260 zs.avail_out = BGZF_BLOCK_SIZE;
262 if (inflateInit2(&zs, -15) != Z_OK) {
263 fp->errcode |= BGZF_ERR_ZLIB;
266 if (inflate(&zs, Z_FINISH) != Z_STREAM_END) {
268 fp->errcode |= BGZF_ERR_ZLIB;
271 if (inflateEnd(&zs) != Z_OK) {
272 fp->errcode |= BGZF_ERR_ZLIB;
278static int check_header(
const uint8_t *header)
280 return (header[0] == 31 && header[1] == 139 && header[2] == 8 && (header[3] & 4) != 0
281 && unpackInt16((uint8_t*)&header[10]) == 6
282 && header[12] ==
'B' && header[13] ==
'C'
283 && unpackInt16((uint8_t*)&header[14]) == 2);
287static void free_cache(
BGZF *fp)
290 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
291 if (fp->open_mode !=
'r')
return;
292 for (k = kh_begin(h); k < kh_end(h); ++k)
293 if (kh_exist(h, k)) free(kh_val(h, k).block);
294 kh_destroy(cache, h);
297static int load_block_from_cache(
BGZF *fp, int64_t block_address)
301 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
302 k = kh_get(cache, h, block_address);
303 if (k == kh_end(h))
return 0;
305 if (fp->block_length != 0) fp->block_offset = 0;
306 fp->block_address = block_address;
307 fp->block_length = p->size;
308 memcpy(fp->uncompressed_block, p->block, BGZF_BLOCK_SIZE);
309 _bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET);
313static void cache_block(
BGZF *fp,
int size)
318 khash_t(cache) *h = (khash_t(cache)*)fp->cache;
319 if (BGZF_BLOCK_SIZE >= fp->cache_size)
return;
320 if ((kh_size(h) + 1) * BGZF_BLOCK_SIZE > fp->cache_size) {
324 for (k = kh_begin(h); k < kh_end(h); ++k)
325 if (kh_exist(h, k))
break;
327 free(kh_val(h, k).block);
331 k = kh_put(cache, h, fp->block_address, &ret);
332 if (ret == 0)
return;
334 p->size = fp->block_length;
335 p->end_offset = fp->block_address + size;
336 p->block = malloc(BGZF_BLOCK_SIZE);
337 memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_BLOCK_SIZE);
340static void free_cache(
BGZF *fp) {}
341static int load_block_from_cache(
BGZF *fp, int64_t block_address) {
return 0;}
342static void cache_block(
BGZF *fp,
int size) {}
345int bgzf_read_block(
BGZF *fp)
347 uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;
348 int count, size = 0, block_length, remaining;
349 int64_t block_address;
350 block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
351 if (load_block_from_cache(fp, block_address))
return 0;
352 count = _bgzf_read(fp->fp, header,
sizeof(header));
354 fp->block_length = 0;
357 if (count !=
sizeof(header) || !check_header(header)) {
358 fp->errcode |= BGZF_ERR_HEADER;
362 block_length = unpackInt16((uint8_t*)&header[16]) + 1;
363 compressed_block = (uint8_t*)fp->compressed_block;
364 memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
365 remaining = block_length - BLOCK_HEADER_LENGTH;
366 count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
367 if (count != remaining) {
368 fp->errcode |= BGZF_ERR_IO;
372 if ((count = inflate_block(fp, block_length)) < 0)
return -1;
373 if (fp->block_length != 0) fp->block_offset = 0;
374 fp->block_address = block_address;
375 fp->block_length = count;
376 cache_block(fp, size);
380ssize_t bgzf_read(
BGZF *fp,
void *data, ssize_t length)
382 ssize_t bytes_read = 0;
383 uint8_t *output = data;
384 if (length <= 0)
return 0;
385 assert(fp->open_mode ==
'r');
386 while (bytes_read < length) {
387 int copy_length, available = fp->block_length - fp->block_offset;
389 if (available <= 0) {
390 if (bgzf_read_block(fp) != 0)
return -1;
391 available = fp->block_length - fp->block_offset;
392 if (available <= 0)
break;
394 copy_length = length - bytes_read < available? length - bytes_read : available;
395 buffer = fp->uncompressed_block;
396 memcpy(output, buffer + fp->block_offset, copy_length);
397 fp->block_offset += copy_length;
398 output += copy_length;
399 bytes_read += copy_length;
401 if (fp->block_offset == fp->block_length) {
402 fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
403 fp->block_offset = fp->block_length = 0;
408int bgzf_flush(
BGZF *fp)
410 assert(fp->open_mode ==
'w');
411 while (fp->block_offset > 0) {
413 block_length = deflate_block(fp, fp->block_offset);
414 if (block_length < 0)
return -1;
415 if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != block_length) {
416 fp->errcode |= BGZF_ERR_IO;
419 fp->block_address += block_length;
424int bgzf_flush_try(
BGZF *fp, ssize_t size)
426 if (fp->block_offset + size > BGZF_BLOCK_SIZE)
427 return bgzf_flush(fp);
431ssize_t bgzf_write(
BGZF *fp,
const void *data, ssize_t length)
433 const uint8_t *input = data;
434 int block_length = BGZF_BLOCK_SIZE, bytes_written;
435 assert(fp->open_mode ==
'w');
438 while (bytes_written < length) {
439 uint8_t* buffer = fp->uncompressed_block;
440 int copy_length = block_length - fp->block_offset < length - bytes_written? block_length - fp->block_offset : length - bytes_written;
441 memcpy(buffer + fp->block_offset, input, copy_length);
442 fp->block_offset += copy_length;
443 input += copy_length;
444 bytes_written += copy_length;
445 if (fp->block_offset == block_length && bgzf_flush(fp))
break;
447 return bytes_written;
450int bgzf_close(
BGZF* fp)
452 int ret, count, block_length;
453 if (fp == 0)
return -1;
454 if (fp->open_mode ==
'w') {
455 if (bgzf_flush(fp) != 0)
return -1;
456 block_length = deflate_block(fp, 0);
457 count = fwrite(fp->compressed_block, 1, block_length, fp->fp);
462 if (fflush(fp->fp) != 0) {
463 fp->errcode |= BGZF_ERR_IO;
467 ret = fp->open_mode ==
'w'? fclose(fp->fp) : _bgzf_close(fp->fp);
468 if (ret != 0)
return -1;
469 free(fp->uncompressed_block);
470 free(fp->compressed_block);
476void bgzf_set_cache_size(
BGZF *fp,
int cache_size)
478 if (fp) fp->cache_size = cache_size;
481int bgzf_check_EOF(
BGZF *fp)
483 static uint8_t magic[28] =
"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
486 static uint8_t magic2[28] =
"\4\0\0\0\0\0\377\6\0\102\103\2\0\036\0\1\0\0\377\377\0\0\0\0\0\0\0\0";
489 offset = _bgzf_tell((_bgzf_file_t)fp->fp);
490 if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0)
return 0;
491 int count = _bgzf_read(fp->fp, buf, 28);
494 fp->errcode |= BGZF_ERR_IO;
497 _bgzf_seek(fp->fp, offset, SEEK_SET);
498 if((memcmp(magic, buf, 28) == 0) || (memcmp(magic2, buf, 28) == 0))
505int64_t bgzf_seek(
BGZF* fp, int64_t pos,
int where)
508 int64_t block_address;
510 if (fp->open_mode !=
'r' || where != SEEK_SET) {
511 fp->errcode |= BGZF_ERR_MISUSE;
514 block_offset = pos & 0xFFFF;
515 block_address = pos >> 16;
516 if (_bgzf_seek(fp->fp, block_address, SEEK_SET) < 0) {
517 fp->errcode |= BGZF_ERR_IO;
520 fp->block_length = 0;
521 fp->block_address = block_address;
522 fp->block_offset = block_offset;
526int bgzf_is_bgzf(
const char *fn)
531 if ((fp = _bgzf_open(fn,
"r")) == 0)
return 0;
532 n = _bgzf_read(fp, buf, 16);
534 if (n != 16)
return 0;
535 return memcmp(g_magic, buf, 16) == 0? 1 : 0;
538int bgzf_getc(
BGZF *fp)
541 if (fp->block_offset >= fp->block_length) {
542 if (bgzf_read_block(fp) != 0)
return -2;
543 if (fp->block_length == 0)
return -1;
545 c = ((
unsigned char*)fp->uncompressed_block)[fp->block_offset++];
546 if (fp->block_offset == fp->block_length) {
547 fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
548 fp->block_offset = 0;
549 fp->block_length = 0;
555#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
561 unsigned char *buf = (
unsigned char*)fp->uncompressed_block;
564 if (fp->block_offset >= fp->block_length) {
565 if (bgzf_read_block(fp) != 0) { state = -2;
break; }
566 if (fp->block_length == 0) { state = -1;
break; }
568 for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
569 if (l < fp->block_length) state = 1;
570 l -= fp->block_offset;
571 if (str->l + l + 1 >= str->m) {
572 str->m = str->l + l + 2;
574 str->s = (
char*)realloc(str->s, str->m);
576 memcpy(str->s + str->l, buf + fp->block_offset, l);
578 fp->block_offset += l + 1;
579 if (fp->block_offset >= fp->block_length) {
580 fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
581 fp->block_offset = 0;
582 fp->block_length = 0;
584 }
while (state == 0);
585 if (str->l == 0 && state < 0)
return state;