Line data Source code
1 : /*
2 : Copyright 2016-2021 Esri
3 : Licensed under the Apache License, Version 2.0 (the "License");
4 : you may not use this file except in compliance with the License.
5 : You may obtain a copy of the License at
6 : http://www.apache.org/licenses/LICENSE-2.0
7 : Unless required by applicable law or agreed to in writing, software
8 : distributed under the License is distributed on an "AS IS" BASIS,
9 : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 : See the License for the specific language governing permissions and
11 : limitations under the License.
12 :
13 : Contributors: Lucian Plesea
14 : */
15 :
16 : //
17 : // Implements an RLE codec packer. The RLE uses a dedicated marker code
18 : // This particular packer picks the least used byte value as marker,
19 : // which makes the worst case data expansion more reasonable, at the expense
20 : // of taking two passes over the input data
21 : //
22 :
23 : // For memset
24 : #include <cstring>
25 : #include <vector>
26 : #include <algorithm>
27 :
28 : #include "marfa.h"
29 : #include "Packer_RLE.h"
30 :
31 : NAMESPACE_MRF_START
32 :
33 : Packer::~Packer() = default;
34 :
35 : //
36 : // RLE yarn codec, uses a dedicated code as marker, default value is 0xC3
37 : // This is the C implementation, there is also a C++ one
38 : // For yarnball compression, it performs better than using a double char marker
39 : //
40 : // Includes both encoder and decoder
41 : // Worst case input:output ratio is 1:2, when the input is 1,2 or three marker
42 : // codes For long input streams, the input:output ratio is between 13260.6:1
43 : // (best) and 1:1.75 (worst)
44 : //
45 : // Could be a byte stream filter which needs very little local storage
46 : //
47 :
48 : constexpr int MAX_RUN = 768 + 0xffff;
49 : typedef unsigned char Byte;
50 :
51 : #define UC(X) static_cast<Byte>(X)
52 :
53 : // Encode helper function
54 : // It returns how many times the byte at *s is repeated
55 : // a value between 1 and min(max_count, MAX_RUN)
56 1340 : inline static int run_length(const Byte *s, int max_count)
57 : {
58 1340 : if (max_count > MAX_RUN)
59 0 : max_count = MAX_RUN;
60 1340 : const Byte c = *s++;
61 393387 : for (int count = 1; count < max_count; count++)
62 393375 : if (c != *s++)
63 1328 : return count;
64 12 : return max_count;
65 : }
66 :
67 : #define RET_NOW \
68 : do \
69 : { \
70 : return static_cast<size_t>(next - reinterpret_cast<Byte *>(obuf)); \
71 : } while (0)
72 :
73 : //
74 : // C compress function, returns compressed size
75 : // len is the size of the input buffer
76 : // caller should ensure that output buffer is at least 2 * N to be safe,
77 : // dropping to N * 7/4 for larger input
78 : // If the Code is chosen to be the least present value in input, the
79 : // output size requirement is bound by N / 256 + N
80 : //
81 12 : static size_t toYarn(const char *ibuffer, char *obuf, size_t len,
82 : Byte CODE = 0xC3)
83 : {
84 12 : Byte *next = reinterpret_cast<Byte *>(obuf);
85 :
86 1352 : while (len)
87 : {
88 1340 : Byte b = static_cast<Byte>(*ibuffer);
89 1340 : int run = run_length(reinterpret_cast<const Byte *>(ibuffer),
90 : static_cast<int>(len));
91 1340 : if (run < 4)
92 : { // Encoded as single bytes, stored as such, CODE followed by a zero
93 482 : run = 1;
94 482 : *next++ = b;
95 482 : if (CODE == b)
96 0 : *next++ = 0;
97 : }
98 : else
99 : { // Encoded as a sequence
100 858 : *next++ = CODE; // Start with Marker code, always present
101 :
102 858 : if (run > 767)
103 : { // long sequence
104 12 : ibuffer += 768; // May be unsafe to read *ibuffer
105 12 : len -= 768;
106 12 : run -= 768;
107 12 : *next++ = 3;
108 12 : *next++ = UC(run >> 8); // Forced high count
109 : }
110 846 : else if (run > 255)
111 : { // Between 256 and 767, medium sequence
112 292 : *next++ = UC(run >> 8); // High count
113 : }
114 :
115 858 : *next++ = UC(run & 0xff); // Low count, always present
116 858 : *next++ = b; // End with Value, always present
117 : }
118 1340 : ibuffer += run;
119 1340 : len -= run;
120 : }
121 12 : RET_NOW;
122 : }
123 :
124 : // Check that another input byte can be read, return now otherwise
125 : // Adjusts the input length too
126 : #define CHECK_INPUT \
127 : if (ilen == 0) \
128 : RET_NOW
129 : // Reads a byte and adjust the input pointer
130 : #define NEXT_BYTE UC(*ibuffer++)
131 :
132 : //
133 : // C decompress function, returns actual decompressed size
134 : // Stops when either olen is reached or when ilen is exhausted
135 : // returns the number of output bytes written
136 : //
137 8 : static size_t fromYarn(const char *ibuffer, size_t ilen, char *obuf,
138 : size_t olen, Byte CODE = 0xC3)
139 : {
140 8 : Byte *next = reinterpret_cast<Byte *>(obuf);
141 799 : while (ilen > 0 && olen > 0)
142 : {
143 : // It is safe to read and write one byte
144 791 : Byte b = NEXT_BYTE;
145 791 : ilen--;
146 791 : if (b != CODE)
147 : { // Copy single chars
148 314 : *next++ = b;
149 314 : olen--;
150 : }
151 : else
152 : { // Marker found, which type of sequence is it?
153 477 : CHECK_INPUT;
154 477 : b = NEXT_BYTE;
155 477 : ilen--;
156 477 : if (b == 0)
157 : { // Emit one code
158 0 : *next++ = CODE;
159 0 : olen--;
160 : }
161 : else
162 : { // Sequence
163 477 : size_t run = 0;
164 477 : if (b < 4)
165 : {
166 164 : run = 256 * b;
167 164 : if (3 == b)
168 : { // Second byte of high count
169 8 : CHECK_INPUT;
170 8 : run += 256 * NEXT_BYTE;
171 8 : ilen--;
172 : }
173 164 : CHECK_INPUT;
174 164 : run += NEXT_BYTE;
175 164 : ilen--;
176 : }
177 : else
178 : { // Single byte count
179 313 : run = b;
180 : }
181 :
182 : // Write the sequence out, after checking
183 477 : if (olen < run)
184 0 : RET_NOW;
185 477 : CHECK_INPUT;
186 477 : b = NEXT_BYTE;
187 477 : ilen--;
188 477 : memset(next, b, run);
189 :
190 477 : next += run;
191 477 : olen -= run;
192 : }
193 : }
194 : }
195 8 : RET_NOW;
196 : }
197 :
198 : // Returns the least used byte value from a buffer
199 12 : static Byte getLeastUsed(const Byte *src, size_t len)
200 : {
201 12 : std::vector<unsigned int> hist(256, 0);
202 393228 : while (len)
203 : {
204 393216 : --len;
205 393216 : hist[*src++]++;
206 : }
207 : const size_t nIdxMin =
208 12 : std::min_element(hist.begin(), hist.end()) - hist.begin();
209 24 : return UC(nIdxMin);
210 : }
211 :
212 : // Read from a packed source until the src is exhausted
213 : // Returns true if all output buffer was filled, 0 otherwise
214 8 : int RLEC3Packer::load(storage_manager *src, storage_manager *dst)
215 : {
216 : // Use the first char in the input buffer as marker code
217 16 : return dst->size == fromYarn(src->buffer + 1, src->size - 1, dst->buffer,
218 8 : dst->size, *src->buffer);
219 : }
220 :
221 : //
222 : // Picks the least use value as the marker code and stores it first in the data
223 : // This choice improves the worst case expansion, which becomes (1 + N / 256 +
224 : // N) : N It also improves compression in general Makes best case marginally
225 : // worse because the chosen code adds one byte to the output
226 : //
227 :
228 12 : int RLEC3Packer::store(storage_manager *src, storage_manager *dst)
229 : {
230 12 : size_t N = src->size;
231 12 : if (dst->size < 1 + N + N / 256)
232 0 : return 0; // Failed, destination might overflow
233 : Byte c =
234 12 : getLeastUsed(reinterpret_cast<const Byte *>(src->buffer), src->size);
235 12 : *dst->buffer++ = static_cast<char>(c);
236 12 : dst->size = 1 + toYarn(src->buffer, dst->buffer, src->size, c);
237 12 : return 1; // Success, size is in dst->size
238 : }
239 :
240 : NAMESPACE_MRF_END
|