Line data Source code
1 : /*
2 : Copyright 2016-2021 Esri
3 : Licensed under the Apache License, Version 2.0 (the "License");
4 : you may not use this file except in compliance with the License.
5 : You may obtain a copy of the License at
6 : http://www.apache.org/licenses/LICENSE-2.0
7 : Unless required by applicable law or agreed to in writing, software
8 : distributed under the License is distributed on an "AS IS" BASIS,
9 : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 : See the License for the specific language governing permissions and
11 : limitations under the License.
12 :
13 : Contributors: Lucian Plesea
14 : */
15 :
16 : //
17 : // Implements an RLE codec packer. The RLE uses a dedicated marker code
18 : // This particular packer picks the least used byte value as marker,
19 : // which makes the worst case data expansion more reasonable, at the expense
20 : // of taking two passes over the input data
21 : //
22 :
23 : // For memset
24 : #include <cstring>
25 : #include <vector>
26 : #include <algorithm>
27 :
28 : #include "marfa.h"
29 : #include "Packer_RLE.h"
30 :
31 : NAMESPACE_MRF_START
32 : //
33 : // RLE yarn codec, uses a dedicated code as marker, default value is 0xC3
34 : // This is the C implementation, there is also a C++ one
35 : // For yarnball compression, it performs better than using a double char marker
36 : //
37 : // Includes both encoder and decoder
38 : // Worst case input:output ratio is 1:2, when the input is 1,2 or three marker
39 : // codes For long input streams, the input:output ratio is between 13260.6:1
40 : // (best) and 1:1.75 (worst)
41 : //
42 : // Could be a byte stream filter which needs very little local storage
43 : //
44 :
45 : constexpr int MAX_RUN = 768 + 0xffff;
46 : typedef unsigned char Byte;
47 :
48 : #define UC(X) static_cast<Byte>(X)
49 :
50 : // Encode helper function
51 : // It returns how many times the byte at *s is repeated
52 : // a value between 1 and min(max_count, MAX_RUN)
53 1340 : inline static int run_length(const Byte *s, int max_count)
54 : {
55 1340 : if (max_count > MAX_RUN)
56 0 : max_count = MAX_RUN;
57 1340 : const Byte c = *s++;
58 393387 : for (int count = 1; count < max_count; count++)
59 393375 : if (c != *s++)
60 1328 : return count;
61 12 : return max_count;
62 : }
63 :
64 : #define RET_NOW \
65 : do \
66 : { \
67 : return static_cast<size_t>(next - reinterpret_cast<Byte *>(obuf)); \
68 : } while (0)
69 :
70 : //
71 : // C compress function, returns compressed size
72 : // len is the size of the input buffer
73 : // caller should ensure that output buffer is at least 2 * N to be safe,
74 : // dropping to N * 7/4 for larger input
75 : // If the Code is chosen to be the least present value in input, the
76 : // output size requirement is bound by N / 256 + N
77 : //
78 12 : static size_t toYarn(const char *ibuffer, char *obuf, size_t len,
79 : Byte CODE = 0xC3)
80 : {
81 12 : Byte *next = reinterpret_cast<Byte *>(obuf);
82 :
83 1352 : while (len)
84 : {
85 1340 : Byte b = static_cast<Byte>(*ibuffer);
86 1340 : int run = run_length(reinterpret_cast<const Byte *>(ibuffer),
87 : static_cast<int>(len));
88 1340 : if (run < 4)
89 : { // Encoded as single bytes, stored as such, CODE followed by a zero
90 482 : run = 1;
91 482 : *next++ = b;
92 482 : if (CODE == b)
93 0 : *next++ = 0;
94 : }
95 : else
96 : { // Encoded as a sequence
97 858 : *next++ = CODE; // Start with Marker code, always present
98 :
99 858 : if (run > 767)
100 : { // long sequence
101 12 : ibuffer += 768; // May be unsafe to read *ibuffer
102 12 : len -= 768;
103 12 : run -= 768;
104 12 : *next++ = 3;
105 12 : *next++ = UC(run >> 8); // Forced high count
106 : }
107 846 : else if (run > 255)
108 : { // Between 256 and 767, medium sequence
109 292 : *next++ = UC(run >> 8); // High count
110 : }
111 :
112 858 : *next++ = UC(run & 0xff); // Low count, always present
113 858 : *next++ = b; // End with Value, always present
114 : }
115 1340 : ibuffer += run;
116 1340 : len -= run;
117 : }
118 12 : RET_NOW;
119 : }
120 :
121 : // Check that another input byte can be read, return now otherwise
122 : // Adjusts the input length too
123 : #define CHECK_INPUT \
124 : if (ilen == 0) \
125 : RET_NOW
126 : // Reads a byte and adjust the input pointer
127 : #define NEXT_BYTE UC(*ibuffer++)
128 :
129 : //
130 : // C decompress function, returns actual decompressed size
131 : // Stops when either olen is reached or when ilen is exhausted
132 : // returns the number of output bytes written
133 : //
134 8 : static size_t fromYarn(const char *ibuffer, size_t ilen, char *obuf,
135 : size_t olen, Byte CODE = 0xC3)
136 : {
137 8 : Byte *next = reinterpret_cast<Byte *>(obuf);
138 799 : while (ilen > 0 && olen > 0)
139 : {
140 : // It is safe to read and write one byte
141 791 : Byte b = NEXT_BYTE;
142 791 : ilen--;
143 791 : if (b != CODE)
144 : { // Copy single chars
145 314 : *next++ = b;
146 314 : olen--;
147 : }
148 : else
149 : { // Marker found, which type of sequence is it?
150 477 : CHECK_INPUT;
151 477 : b = NEXT_BYTE;
152 477 : ilen--;
153 477 : if (b == 0)
154 : { // Emit one code
155 0 : *next++ = CODE;
156 0 : olen--;
157 : }
158 : else
159 : { // Sequence
160 477 : size_t run = 0;
161 477 : if (b < 4)
162 : {
163 164 : run = 256 * b;
164 164 : if (3 == b)
165 : { // Second byte of high count
166 8 : CHECK_INPUT;
167 8 : run += 256 * NEXT_BYTE;
168 8 : ilen--;
169 : }
170 164 : CHECK_INPUT;
171 164 : run += NEXT_BYTE;
172 164 : ilen--;
173 : }
174 : else
175 : { // Single byte count
176 313 : run = b;
177 : }
178 :
179 : // Write the sequence out, after checking
180 477 : if (olen < run)
181 0 : RET_NOW;
182 477 : CHECK_INPUT;
183 477 : b = NEXT_BYTE;
184 477 : ilen--;
185 477 : memset(next, b, run);
186 :
187 477 : next += run;
188 477 : olen -= run;
189 : }
190 : }
191 : }
192 8 : RET_NOW;
193 : }
194 :
195 : // Returns the least used byte value from a buffer
196 12 : static Byte getLeastUsed(const Byte *src, size_t len)
197 : {
198 12 : std::vector<unsigned int> hist(256, 0);
199 393228 : while (len)
200 : {
201 393216 : --len;
202 393216 : hist[*src++]++;
203 : }
204 : const size_t nIdxMin =
205 12 : std::min_element(hist.begin(), hist.end()) - hist.begin();
206 24 : return UC(nIdxMin);
207 : }
208 :
209 : // Read from a packed source until the src is exhausted
210 : // Returns true if all output buffer was filled, 0 otherwise
211 8 : int RLEC3Packer::load(storage_manager *src, storage_manager *dst)
212 : {
213 : // Use the first char in the input buffer as marker code
214 16 : return dst->size == fromYarn(src->buffer + 1, src->size - 1, dst->buffer,
215 8 : dst->size, *src->buffer);
216 : }
217 :
218 : //
219 : // Picks the least use value as the marker code and stores it first in the data
220 : // This choice improves the worst case expansion, which becomes (1 + N / 256 +
221 : // N) : N It also improves compression in general Makes best case marginally
222 : // worse because the chosen code adds one byte to the output
223 : //
224 :
225 12 : int RLEC3Packer::store(storage_manager *src, storage_manager *dst)
226 : {
227 12 : size_t N = src->size;
228 12 : if (dst->size < 1 + N + N / 256)
229 0 : return 0; // Failed, destination might overflow
230 : Byte c =
231 12 : getLeastUsed(reinterpret_cast<const Byte *>(src->buffer), src->size);
232 12 : *dst->buffer++ = static_cast<char>(c);
233 12 : dst->size = 1 + toYarn(src->buffer, dst->buffer, src->size, c);
234 12 : return 1; // Success, size is in dst->size
235 : }
236 :
237 : NAMESPACE_MRF_END
|