Skip to content

Commit 20bc4ff

Browse files
Serialization and deserialization to file, new serialization formats (#53)
Add the ability to serialize and deserialize a file directly with the serializeFileAsync and deserializeFileAsync methods. These two are fully asynchronous and faster than loading the file in memory and serializing/deserializing in memory; they use mmap if possible. Add the ability to serialize to file in text formats, and to deserialize from a buffer and from a file in text formats: newline_separated_values, comma_separated_values, tab_separated_values, json_array. Add the ability to serialize and deserialize a binary array of little endian uint32_t values. Remove COW enabled by default: it could cause issues with asynchronous operations and multi-threading; a flag will be provided in a new version.
1 parent 1b476d8 commit 20bc4ff

23 files changed

+2843
-700
lines changed

.vscode/settings.json

+9-69
Original file line numberDiff line numberDiff line change
@@ -77,84 +77,24 @@
7777
},
7878
"files.associations": {
7979
"__bit_reference": "cpp",
80-
"__bits": "cpp",
81-
"__config": "cpp",
82-
"__debug": "cpp",
83-
"__errc": "cpp",
84-
"__hash_table": "cpp",
85-
"__locale": "cpp",
86-
"__mutex_base": "cpp",
8780
"__node_handle": "cpp",
88-
"__nullptr": "cpp",
89-
"__split_buffer": "cpp",
90-
"__string": "cpp",
91-
"__threading_support": "cpp",
92-
"__tree": "cpp",
93-
"__tuple": "cpp",
94-
"array": "cpp",
95-
"atomic": "cpp",
9681
"bitset": "cpp",
97-
"cctype": "cpp",
98-
"chrono": "cpp",
99-
"cinttypes": "cpp",
100-
"clocale": "cpp",
101-
"cmath": "cpp",
102-
"compare": "cpp",
103-
"complex": "cpp",
104-
"concepts": "cpp",
105-
"cstdarg": "cpp",
106-
"cstddef": "cpp",
107-
"cstdint": "cpp",
108-
"cstdio": "cpp",
109-
"cstdlib": "cpp",
110-
"cstring": "cpp",
111-
"ctime": "cpp",
112-
"cwchar": "cpp",
113-
"cwctype": "cpp",
11482
"deque": "cpp",
115-
"exception": "cpp",
116-
"fstream": "cpp",
117-
"initializer_list": "cpp",
118-
"iomanip": "cpp",
119-
"ios": "cpp",
120-
"iosfwd": "cpp",
121-
"iostream": "cpp",
122-
"istream": "cpp",
83+
"__memory": "cpp",
12384
"limits": "cpp",
124-
"locale": "cpp",
125-
"map": "cpp",
126-
"memory": "cpp",
127-
"mutex": "cpp",
128-
"new": "cpp",
129-
"numeric": "cpp",
13085
"optional": "cpp",
131-
"ostream": "cpp",
132-
"queue": "cpp",
133-
"random": "cpp",
13486
"ratio": "cpp",
135-
"set": "cpp",
136-
"sstream": "cpp",
137-
"stack": "cpp",
138-
"stdexcept": "cpp",
139-
"streambuf": "cpp",
140-
"string": "cpp",
141-
"string_view": "cpp",
14287
"system_error": "cpp",
14388
"tuple": "cpp",
14489
"type_traits": "cpp",
145-
"typeinfo": "cpp",
146-
"unordered_map": "cpp",
147-
"variant": "cpp",
14890
"vector": "cpp",
149-
"algorithm": "cpp",
150-
"roaring.c": "cpp",
151-
"isadetection.c": "cpp",
152-
"array_util.c": "cpp",
153-
"bitset.c": "cpp",
154-
"bitset_util.c": "cpp",
155-
"containers.c": "cpp",
156-
"__verbose_abort": "cpp",
157-
"bit": "cpp",
158-
"filesystem": "cpp"
91+
"chrono": "cpp",
92+
"filesystem": "cpp",
93+
"random": "cpp",
94+
"__config": "cpp",
95+
"__nullptr": "cpp",
96+
"atomic": "cpp",
97+
"locale": "cpp",
98+
"cstddef": "cpp"
15999
}
160100
}

index.d.ts

+165-10
Original file line numberDiff line numberDiff line change
@@ -157,15 +157,83 @@ export enum SerializationFormat {
157157
*
158158
*/
159159
unsafe_frozen_croaring = "unsafe_frozen_croaring",
160+
161+
/**
162+
* A plain binary array of 32 bits integers in little endian format. 4 bytes per value.
163+
*/
164+
uint32_array = "uint32_array",
165+
}
166+
167+
/**
 * Formats that can be used when serializing a bitmap directly to a file
 * (see the `format` parameter of `serializeFileAsync`).
 *
 * Includes the binary formats and the text formats.
 */
export enum FileSerializationFormat {
  /**
   * Stable Optimized non portable C/C++ format. Used by croaring. Can be smaller than the portable format.
   */
  croaring = "croaring",

  /**
   * Stable Portable Java and Go format.
   */
  // The value must be "portable": it is the literal accepted by SerializationFormatType
  // and the value used by FileDeserializationFormat.portable.
  portable = "portable",

  /**
   * A plain binary array of 32 bits integers in little endian format. 4 bytes per value.
   */
  uint32_array = "uint32_array",

  /**
   * Non portable C/C++ frozen format.
   * Is considered unsafe and unstable because the format might change at any new version.
   * Can be useful for temporary storage or for sending data over the network between similar machines.
   * If the content is corrupted when deserialized or when a frozen view is created, the behavior is undefined!
   * The application may crash, buffer overrun, could be a vector of attack!
   *
   * When this option is used in the serialize function, the new returned buffer (if no buffer was provided) will be aligned to a 32 bytes boundary.
   * This is required to create a frozen view with the method unsafeFrozenView.
   *
   */
  unsafe_frozen_croaring = "unsafe_frozen_croaring",

  /**
   * Comma separated values, all values are in decimal and in one line without spaces or other characters.
   */
  comma_separated_values = "comma_separated_values",

  /**
   * Tab "\t" separated values, all values are in decimal and in one line without other characters.
   */
  tab_separated_values = "tab_separated_values",

  /**
   * Newline (\n) separated values, all values are in decimal and one per line with a terminating newline.
   */
  newline_separated_values = "newline_separated_values",

  /**
   * A JSON file in the format "[1,2,3,4...]"
   */
  json_array = "json_array",
}
161216

162217
/**
 * Any value accepted as an in-memory serialization format:
 * a SerializationFormat enum member, one of the equivalent string literals, or a boolean.
 */
export type SerializationFormatType =
  SerializationFormat | "croaring" | "portable" | "unsafe_frozen_croaring" | "uint32_array" | boolean;
168224

225+
/**
 * Any value accepted as a file serialization format:
 * everything allowed by SerializationFormatType, a FileSerializationFormat enum member,
 * or one of the text format string literals.
 */
export type FileSerializationFormatType =
  | SerializationFormatType
  | FileSerializationFormat
  | ("comma_separated_values" | "tab_separated_values" | "newline_separated_values" | "json_array");
232+
233+
/**
 * Intersection of SerializationFormatType and DeserializationFormatType:
 * the format values that are assignable to both types.
 */
export type SerializationDeserializationFormatType = SerializationFormatType & DeserializationFormatType;
234+
235+
/**
 * Intersection of FileSerializationFormatType and FileDeserializationFormatType:
 * the file format values that are assignable to both types.
 */
export type FileSerializationDeserializationFormatType = FileSerializationFormatType & FileDeserializationFormatType;
236+
169237
export enum DeserializationFormat {
170238
/** Stable Optimized non portable C/C++ format. Used by croaring. Can be smaller than the portable format. */
171239
croaring = "croaring",
@@ -190,16 +258,69 @@ export enum DeserializationFormat {
190258
* The application may crash, buffer overrun, could be a vector of attack!
191259
*/
192260
unsafe_frozen_portable = "unsafe_frozen_portable",
261+
262+
/**
263+
* A plain binary array of 32 bits integers in little endian format. 4 bytes per value.
264+
*/
265+
uint32_array = "uint32_array",
266+
267+
comma_separated_values = "comma_separated_values",
268+
tab_separated_values = "tab_separated_values",
269+
newline_separated_values = "newline_separated_values",
270+
json_array = "json_array",
193271
}
194272

195273
export type DeserializationFormatType =
196-
| SerializationFormat
274+
| DeserializationFormat
197275
| "croaring"
198276
| "portable"
199277
| "unsafe_frozen_croaring"
200278
| "unsafe_frozen_portable"
279+
| "uint32_array"
280+
| "comma_separated_values"
281+
| "tab_separated_values"
282+
| "newline_separated_values"
283+
| "json_array"
201284
| boolean;
202285

286+
/**
 * Formats that can be used when deserializing a bitmap directly from a file
 * (see the `format` parameter of `deserializeFileAsync`).
 */
export enum FileDeserializationFormat {
  /** Stable, optimized, non portable C/C++ format used by croaring. Can be smaller than the portable format. */
  croaring = "croaring",

  /** Stable, portable format shared with the Java and Go implementations. */
  portable = "portable",

  /**
   * Non portable C/C++ frozen format.
   * Considered unsafe and unstable: the format might change at any new version.
   * Can be useful for temporary storage or for sending data over the network between similar machines.
   * If the content is corrupted when loaded, or the buffer is modified when a frozen view is created, the behavior is undefined!
   * The application may crash, buffer overrun, could be a vector of attack!
   */
  unsafe_frozen_croaring = "unsafe_frozen_croaring",

  /**
   * Portable version of the frozen view, compatible with Go and Java.
   * Considered unsafe and unstable: the format might change at any new version.
   * Can be useful for temporary storage or for sending data over the network between similar machines.
   * If the content is corrupted when loaded, or the buffer is modified when a frozen view is created, the behavior is undefined!
   * The application may crash, buffer overrun, could be a vector of attack!
   */
  unsafe_frozen_portable = "unsafe_frozen_portable",

  /**
   * A plain binary array of 32 bits integers in little endian format. 4 bytes per value.
   */
  uint32_array = "uint32_array",

  /** Text format: comma separated values. */
  comma_separated_values = "comma_separated_values",

  /** Text format: tab separated values. */
  tab_separated_values = "tab_separated_values",

  /** Text format: newline separated values. */
  newline_separated_values = "newline_separated_values",

  /** Text format: a JSON array of values. */
  json_array = "json_array",
}
321+
322+
/**
 * Any value accepted as a file deserialization format:
 * everything allowed by DeserializationFormatType, or a FileDeserializationFormat enum member.
 */
export type FileDeserializationFormatType = DeserializationFormatType | FileDeserializationFormat;
323+
203324
export enum FrozenViewFormat {
204325
/**
205326
* Non portable C/C++ frozen format.
@@ -1006,6 +1127,18 @@ export interface ReadonlyRoaringBitmap32 extends ReadonlySet<number> {
10061127
format: SerializationFormatType,
10071128
): Promise<Buffer>;
10081129

1130+
/**
1131+
* Serializes the bitmap into a file, asynchronously.
1132+
* The bitmap will be temporarily frozen until the operation completes.
1133+
*
1134+
* This is faster, everything runs in its own thread and it consumes less memory than serializing to a Buffer and then writing it to a file,
1135+
* internally it uses memory mapped files and skips all the JS overhead.
1136+
*
1137+
* @param {FileSerializationFormatType} format One of the FileSerializationFormat enum values, or a boolean value: if false, optimized C/C++ format is used. If true, Java and Go portable format is used.
1138+
* @memberof ReadonlyRoaringBitmap32
1139+
*/
1140+
serializeFileAsync(filePath: string, format: FileSerializationFormatType): Promise<void>;
1141+
10091142
/**
10101143
* Returns a new ReadonlyRoaringBitmap32 that is a copy of this bitmap, same as new ReadonlyRoaringBitmap32(copy)
10111144
*
@@ -1174,7 +1307,6 @@ export interface RoaringBitmap32 extends ReadonlyRoaringBitmap32, Set<number> {
11741307
* Overwrite the content of this bitmap copying it from an Iterable or another RoaringBitmap32.
11751308
*
11761309
* Is faster to pass a Uint32Array instance instead of an array or an iterable.
1177-
*
11781310
* Is even faster if a RoaringBitmap32 instance is used (it performs a simple copy).
11791311
*
11801312
* @param {Iterable<number>} values The new values or a RoaringBitmap32 instance.
@@ -1476,9 +1608,17 @@ export class RoaringBitmap32 {
14761608

14771609
public readonly SerializationFormat: typeof SerializationFormat;
14781610

1479-
public static readonly DeserializationFormat: typeof SerializationFormat;
1611+
public static readonly FileSerializationFormat: typeof FileSerializationFormat;
1612+
1613+
public readonly FileSerializationFormat: typeof FileSerializationFormat;
1614+
1615+
public static readonly FileDeserializationFormat: typeof FileDeserializationFormat;
14801616

1481-
public readonly DeserializationFormat: typeof SerializationFormat;
1617+
public readonly FileDeserializationFormat: typeof FileDeserializationFormat;
1618+
1619+
public static readonly DeserializationFormat: typeof DeserializationFormat;
1620+
1621+
public readonly DeserializationFormat: typeof DeserializationFormat;
14821622

14831623
public static readonly FrozenViewFormat: typeof FrozenViewFormat;
14841624

@@ -1735,12 +1875,10 @@ export class RoaringBitmap32 {
17351875
*
17361876
* Returns a Promise that resolves to a new RoaringBitmap32 instance.
17371877
*
1738-
* Setting the portable flag to false enable a custom format that can save space compared to the portable format (e.g., for very sparse bitmaps).
17391878
* The portable version is meant to be compatible with Java and Go versions.
1879+
* The croaring version is compatible with the C version, it can be smaller than the portable version.
17401880
* When a frozen format is used, the buffer will be copied and the bitmap will be frozen.
17411881
*
1742-
* NOTE: portable argument was optional before, now is required and an Error is thrown if the portable flag is not passed.
1743-
*
17441882
* @static
17451883
* @param {Uint8Array | Uint8ClampedArray | Int8Array | ArrayBuffer| SharedArrayBuffer | null | undefined} serialized An Uint8Array or a node Buffer that contains the serialized data.
17461884
* @param {DeserializationFormatType} format The format of the serialized data. true means "portable". false means "croaring".
@@ -1758,12 +1896,10 @@ export class RoaringBitmap32 {
17581896
*
17591897
* When deserialization is completed or failed, the given callback will be executed.
17601898
*
1761-
* Setting the portable flag to false enable a custom format that can save space compared to the portable format (e.g., for very sparse bitmaps).
17621899
* The portable version is meant to be compatible with Java and Go versions.
1900+
* The croaring version is compatible with the C version, it can be smaller than the portable version.
17631901
* When a frozen format is used, the buffer will be copied and the bitmap will be frozen.
17641902
*
1765-
* NOTE: portable argument was optional before, now is required and an Error is thrown if the portable flag is not passed.
1766-
*
17671903
* @static
17681904
* @param {Uint8Array | Uint8ClampedArray | Int8Array | ArrayBuffer| SharedArrayBuffer | null | undefined} serialized An Uint8Array or a node Buffer that contains the serialized data.
17691905
* @param {DeserializationFormatType} format The format of the serialized data. true means "portable". false means "croaring".
@@ -1777,6 +1913,25 @@ export class RoaringBitmap32 {
17771913
callback: RoaringBitmap32Callback,
17781914
): void;
17791915

1916+
/**
1917+
* Deserializes the bitmap from a file asynchronously.
1918+
* Returns a Promise that resolves to a new RoaringBitmap32 instance.
1919+
*
1920+
* The portable version is meant to be compatible with Java and Go versions.
1921+
* The croaring version is compatible with the C version, it can be smaller than the portable version.
1922+
* When a frozen format is used, the buffer will be copied and the bitmap will be frozen.
1923+
*
1924+
* This is faster, everything runs in its own thread and it consumes less memory than reading the file into a Buffer and then deserializing it,
1925+
* internally it uses memory mapped files and skips all the JS overhead.
1926+
*
1927+
* @static
1928+
* @param {string} filePath The path of the file to read.
1929+
* @param {FileDeserializationFormatType} format The format of the serialized data. true means "portable". false means "croaring".
1930+
* @returns {Promise<RoaringBitmap32>} A promise that resolves to a new RoaringBitmap32 instance.
1931+
* @memberof RoaringBitmap32
1932+
*/
1933+
public static deserializeFileAsync(filePath: string, format: FileDeserializationFormatType): Promise<RoaringBitmap32>;
1934+
17801935
/**
17811936
*
17821937
* Deserializes many bitmaps from an array of Uint8Array or an array of Buffer asynchronously in multiple parallel threads.

0 commit comments

Comments
 (0)