libStatGen Software 1
Loading...
Searching...
No Matches
GenomeSequenceHelpers.h
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#ifndef _GENOME_SEQUENCE_HELPERS_H
19#define _GENOME_SEQUENCE_HELPERS_H
20
21#if !defined(MD5_DIGEST_LENGTH)
22#define MD5_DIGEST_LENGTH 16
23#endif
24
25#include "MemoryMapArray.h"
26
27#include <stdint.h>
28
29
30//
31// ChromosomeInfo represents the per chromosome information
32// necessary to write out SAM/BAM records. In addition, it
33// contains a single internal index used to point to the vector
34// offset where the chromosome bases start.
35//
36// This is mildly non-optimal for larger collections of chromosomes
37// or contigs - one use case described having millions of contigs,
38// in which case, this structure alone would take a gigabyte or more.
39//
41{
42 static const int MAX_GENOME_INFO_STRING=128;
43
44 void constructorClear()
45 {
46 memset(this,0, sizeof(*this));
47 }
48 void setChromosomeName(const char *n)
49 {
50 strncpy(name, n, sizeof(name)-1);
51 name[sizeof(name)-1] = '\0';
52 }
53 genomeIndex_t start; // internal offset to combined genome vector
54 genomeIndex_t size; // SAM SQ:LN value
55 char md5[2*MD5_DIGEST_LENGTH + 1]; // 32 chars plus NUL, SAM SQ:M5 value
56 char name[MAX_GENOME_INFO_STRING]; // SAM SQ:SN value
57 char assemblyID[MAX_GENOME_INFO_STRING]; // SAM SQ:AS value
58 char uri[MAX_GENOME_INFO_STRING]; // SAM SQ:UR value
59 char species[MAX_GENOME_INFO_STRING]; // SAM SQ:SP value
60
61 // handy setting methods:
62 void setAssemblyID(const char *newID)
63 {
64 strncpy(assemblyID, newID, sizeof(assemblyID)-1);
65 name[sizeof(name)-1] = '\0';
66 }
67 void setSpecies(const char *newSpecies)
68 {
69 strncpy(species, newSpecies, sizeof(species)-1);
70 species[sizeof(species)-1] = '\0';
71 }
72 void setURI(const char *newURI)
73 {
74 strncpy(uri, newURI, sizeof(uri)-1);
75 uri[sizeof(uri)-1] = '\0';
76 }
77};
78
80{
81 friend class GenomeSequence;
82 friend std::ostream &operator << (std::ostream &, genomeSequenceMmapHeader &);
83private:
84 uint32_t _chromosomeCount;
85 bool _colorSpace;
86
87 ChromosomeInfo _chromosomes[0];
88
89public:
90 //
91 // getHeaderSize is special in that it must not access any
92 // member variables, since it is called when the header has
93 // not been created yet.
94 //
95 static size_t getHeaderSize(int chromosomeCount)
96 {
97 return sizeof(genomeSequenceMmapHeader) + sizeof(ChromosomeInfo[1]) * chromosomeCount;
98 }
99 //
100 // below methods return TRUE if it failed, false otherwise (primarily
101 // a length check).
102 //
103};
104
105std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h);
106
107//
108// define the genomeSequence array type:
109//
110// NB the access/set routines use the encoded base values in the range
111// 0-15, not the corresponding base pair character.
112//
113inline uint8_t genomeSequenceAccess(void *base, genomeIndex_t index)
114{
115 if ((index&1)==0)
116 {
117 return ((uint8_t *) base)[index>>1] & 0xf;
118 }
119 else
120 {
121 return (((uint8_t *) base)[index>>1] & 0xf0) >> 4;
122 }
123};
124inline void genomeSequenceSet(void *base, genomeIndex_t index, uint8_t v)
125{
126 if ((index&1)==0)
127 {
128 ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0xf0) | v;
129 }
130 else
131 {
132 ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0x0f) | v<<4;
133 }
134}
135
136inline size_t mmapGenomeSequenceElementCount2Bytes(genomeIndex_t i)
137{
138 return sizeof(uint8_t) * i / 2;
139}
140
142{
143 void set(int index, int val)
144 {
145 packedBases[index>>1] =
146 (packedBases[index>>1] // original value
147 & ~(7<<((index&0x01)<<2))) // logical AND off the original value
148 | ((val&0x0f)<<((index&0x1)<<2)); // logical OR in the new value
149 }
150public:
151 std::vector<uint8_t> packedBases;
152 uint32_t length;
153 int size()
154 {
155 return length;
156 }
157 void clear()
158 {
159 packedBases.clear();
160 length=0;
161 }
162 void set(const char *rhs, int padWithNCount = 0);
163 uint8_t operator [](int index)
164 {
165 return (packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf;
166 }
167};
168
169#endif
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.