zip.zig 19 KB


  1. /// The .ZIP File Format Specification is found here:
  2. /// https://pkwaredownloads.blob.core.windows.net/pem/APPNOTE.txt
  3. const std = @import("std");
  4. const testing = std.testing;
  5. pub const File = @import("zip/test.zig").File;
  6. pub const FileCache = @import("zip/test.zig").FileCache;
  7. pub const writeFile = @import("zip/test.zig").writeFile;
  /// Identifier stored in the 16-bit "compression method" field of ZIP headers.
  /// The enum is non-exhaustive, so method numbers not named here can still be
  /// represented after decoding a header.
  pub const CompressionMethod = enum(u16) {
      /// Entry data is stored as-is.
      store = 0x0,
      /// Entry data is a deflate stream.
      deflate = 0x8,
      /// Method number 9.
      deflate64 = 0x9,
      _,
  };
  /// Signature expected in the first four bytes of a central directory file header.
  pub const central_file_header_sig: [4]u8 = .{ 'P', 'K', 0x01, 0x02 };
  /// Signature expected in the first four bytes of a local file header.
  pub const local_file_header_sig: [4]u8 = .{ 'P', 'K', 0x03, 0x04 };
  /// Signature expected in the first four bytes of the end of central directory record.
  pub const end_of_central_directory_sig: [4]u8 = .{ 'P', 'K', 0x05, 0x06 };
  /// The fixed 30-byte portion of a ZIP local file header.
  /// The `filename_len` bytes of filename and `extra_len` bytes of extra data
  /// that follow it in the archive are not part of this struct.
  pub const LocalFileHeader = struct {
      signature: [4]u8,
      minimum_version: u16,
      flags: u16,
      compression_method: CompressionMethod,
      last_modification_time: u16,
      last_modification_date: u16,
      crc32: u32,
      compressed_size: u32,
      uncompressed_size: u32,
      filename_len: u16,
      extra_len: u16,

      /// Decodes the 30-byte on-disk form. Every integer field is read as
      /// little-endian. The signature is copied verbatim and not validated here.
      pub fn deserialize(bytes: [30]u8) LocalFileHeader {
          const readInt = std.mem.readInt;
          const method_raw = readInt(u16, bytes[8..10], .little);
          return LocalFileHeader{
              .signature = bytes[0..4].*,
              .minimum_version = readInt(u16, bytes[4..6], .little),
              .flags = readInt(u16, bytes[6..8], .little),
              .compression_method = @enumFromInt(method_raw),
              .last_modification_time = readInt(u16, bytes[10..12], .little),
              .last_modification_date = readInt(u16, bytes[12..14], .little),
              .crc32 = readInt(u32, bytes[14..18], .little),
              .compressed_size = readInt(u32, bytes[18..22], .little),
              .uncompressed_size = readInt(u32, bytes[22..26], .little),
              .filename_len = readInt(u16, bytes[26..28], .little),
              .extra_len = readInt(u16, bytes[28..30], .little),
          };
      }

      /// Encodes the header into its 30-byte on-disk form (little-endian integers).
      /// This is the inverse of `deserialize`.
      pub fn serialize(self: LocalFileHeader) [30]u8 {
          const writeInt = std.mem.writeInt;
          var out: [30]u8 = undefined;
          @memcpy(out[0..4], &self.signature);
          writeInt(u16, out[4..6], self.minimum_version, .little);
          writeInt(u16, out[6..8], self.flags, .little);
          writeInt(u16, out[8..10], @intFromEnum(self.compression_method), .little);
          writeInt(u16, out[10..12], self.last_modification_time, .little);
          writeInt(u16, out[12..14], self.last_modification_date, .little);
          writeInt(u32, out[14..18], self.crc32, .little);
          writeInt(u32, out[18..22], self.compressed_size, .little);
          writeInt(u32, out[22..26], self.uncompressed_size, .little);
          writeInt(u16, out[26..28], self.filename_len, .little);
          writeInt(u16, out[28..30], self.extra_len, .little);
          return out;
      }
  };
  /// The fixed 46-byte portion of a ZIP central directory file header.
  /// In the archive it is followed by `filename_len`, `extra_len` and
  /// `comment_len` bytes of variable-length data, which are not part of this struct.
  pub const CentralDirectoryFileHeader = struct {
      signature: [4]u8,
      version: u16,
      minimum_version: u16,
      flags: u16,
      compression_method: CompressionMethod,
      last_modification_time: u16,
      last_modification_date: u16,
      crc32: u32,
      compressed_size: u32,
      uncompressed_size: u32,
      filename_len: u16,
      extra_len: u16,
      comment_len: u16,
      disk_number: u16,
      internal_file_attributes: u16,
      external_file_attributes: u32,
      local_file_header_offset: u32,

      /// Decodes the 46-byte on-disk form. Every integer field is read as
      /// little-endian. The signature is copied verbatim and not validated here.
      pub fn deserialize(bytes: [46]u8) CentralDirectoryFileHeader {
          const readInt = std.mem.readInt;
          const method_raw = readInt(u16, bytes[10..12], .little);
          return CentralDirectoryFileHeader{
              .signature = bytes[0..4].*,
              .version = readInt(u16, bytes[4..6], .little),
              .minimum_version = readInt(u16, bytes[6..8], .little),
              .flags = readInt(u16, bytes[8..10], .little),
              .compression_method = @enumFromInt(method_raw),
              .last_modification_time = readInt(u16, bytes[12..14], .little),
              .last_modification_date = readInt(u16, bytes[14..16], .little),
              .crc32 = readInt(u32, bytes[16..20], .little),
              .compressed_size = readInt(u32, bytes[20..24], .little),
              .uncompressed_size = readInt(u32, bytes[24..28], .little),
              .filename_len = readInt(u16, bytes[28..30], .little),
              .extra_len = readInt(u16, bytes[30..32], .little),
              .comment_len = readInt(u16, bytes[32..34], .little),
              .disk_number = readInt(u16, bytes[34..36], .little),
              .internal_file_attributes = readInt(u16, bytes[36..38], .little),
              .external_file_attributes = readInt(u32, bytes[38..42], .little),
              .local_file_header_offset = readInt(u32, bytes[42..46], .little),
          };
      }

      /// Encodes the header into its 46-byte on-disk form (little-endian integers).
      /// This is the inverse of `deserialize`.
      pub fn serialize(self: CentralDirectoryFileHeader) [46]u8 {
          const writeInt = std.mem.writeInt;
          var out: [46]u8 = undefined;
          @memcpy(out[0..4], &self.signature);
          writeInt(u16, out[4..6], self.version, .little);
          writeInt(u16, out[6..8], self.minimum_version, .little);
          writeInt(u16, out[8..10], self.flags, .little);
          writeInt(u16, out[10..12], @intFromEnum(self.compression_method), .little);
          writeInt(u16, out[12..14], self.last_modification_time, .little);
          writeInt(u16, out[14..16], self.last_modification_date, .little);
          writeInt(u32, out[16..20], self.crc32, .little);
          writeInt(u32, out[20..24], self.compressed_size, .little);
          writeInt(u32, out[24..28], self.uncompressed_size, .little);
          writeInt(u16, out[28..30], self.filename_len, .little);
          writeInt(u16, out[30..32], self.extra_len, .little);
          writeInt(u16, out[32..34], self.comment_len, .little);
          writeInt(u16, out[34..36], self.disk_number, .little);
          writeInt(u16, out[36..38], self.internal_file_attributes, .little);
          writeInt(u32, out[38..42], self.external_file_attributes, .little);
          writeInt(u32, out[42..46], self.local_file_header_offset, .little);
          return out;
      }
  };
  /// Decoded fields of the 22-byte end of central directory record.
  /// The 4-byte signature is not kept as a field: `read` skips it and
  /// `serialize` always emits `end_of_central_directory_sig`.
  pub const EndOfCentralDirectoryRecord = struct {
      disk_number: u16,
      central_directory_disk_number: u16,
      record_count_disk: u16,
      record_count_total: u16,
      central_directory_size: u32,
      central_directory_offset: u32,
      comment_len: u16,

      /// Decodes the record from its 22-byte on-disk form (little-endian integers).
      /// Bytes 0..4 (the signature) are ignored and not validated here.
      pub fn read(bytes: [22]u8) EndOfCentralDirectoryRecord {
          const readInt = std.mem.readInt;
          return .{
              .disk_number = readInt(u16, bytes[4..6], .little),
              .central_directory_disk_number = readInt(u16, bytes[6..8], .little),
              .record_count_disk = readInt(u16, bytes[8..10], .little),
              .record_count_total = readInt(u16, bytes[10..12], .little),
              .central_directory_size = readInt(u32, bytes[12..16], .little),
              .central_directory_offset = readInt(u32, bytes[16..20], .little),
              .comment_len = readInt(u16, bytes[20..22], .little),
          };
      }

      /// Encodes the record into its 22-byte on-disk form, starting with
      /// `end_of_central_directory_sig`.
      pub fn serialize(self: EndOfCentralDirectoryRecord) [22]u8 {
          const writeInt = std.mem.writeInt;
          var out: [22]u8 = undefined;
          @memcpy(out[0..4], &end_of_central_directory_sig);
          writeInt(u16, out[4..6], self.disk_number, .little);
          writeInt(u16, out[6..8], self.central_directory_disk_number, .little);
          writeInt(u16, out[8..10], self.record_count_disk, .little);
          writeInt(u16, out[10..12], self.record_count_total, .little);
          writeInt(u32, out[12..16], self.central_directory_size, .little);
          writeInt(u32, out[16..20], self.central_directory_offset, .little);
          writeInt(u16, out[20..22], self.comment_len, .little);
          return out;
      }
  };
  /// Locates the end of central directory (EOCD) record in `file` and returns
  /// its 22 raw bytes.
  ///
  /// The EOCD record may be followed by a variable-length comment, which makes
  /// ZIP parsing ambiguous in general: a comment could itself contain the bytes
  /// of another valid-looking record. This function scans backwards from the
  /// end of the file, one byte at a time. It accepts the first position where
  /// the signature matches and the record's comment-length field equals the
  /// number of bytes that follow the record.
  ///
  /// Errors:
  /// - `error.ZipTruncated` if the file is shorter than 22 bytes or a read
  ///   comes up short.
  /// - `error.ZipMissingEocdr` if no record is found within the maximum comment
  ///   length or before the start of the file.
  ///
  /// TODO: make this more efficient; it needs a backward buffered reader.
  pub fn findEocdr(file: std.fs.File) ![22]u8 {
      const eocdr_len = 22;
      const end_pos = try file.getEndPos();
      if (end_pos < eocdr_len) return error.ZipTruncated;

      // Start with the last 22 bytes, i.e. the position of a comment-less record.
      var window: [eocdr_len]u8 = undefined;
      try file.seekFromEnd(-eocdr_len);
      if ((try file.readAll(&window)) != eocdr_len) return error.ZipTruncated;

      // `trailing` is how many bytes lie between the end of `window` and the
      // end of the file, i.e. the comment length a record here must declare.
      var trailing: u16 = 0;
      while (!(std.mem.eql(u8, window[0..4], &end_of_central_directory_sig) and
          std.mem.readInt(u16, window[20..22], .little) == trailing))
      {
          if (trailing == std.math.maxInt(u16)) return error.ZipMissingEocdr;

          // Slide the window one byte towards the start of the file: shift the
          // contents right by one, then fetch the new leading byte.
          std.mem.copyBackwards(u8, window[1..], window[0 .. window.len - 1]);
          trailing += 1;
          if (@as(u64, eocdr_len) + @as(u64, trailing) > end_pos)
              return error.ZipMissingEocdr;
          try file.seekFromEnd(-eocdr_len - @as(i64, trailing));
          if ((try file.readAll(window[0..1])) != 1) return error.ZipTruncated;
      }
      return window;
  }
  /// Returns a reader type that wraps an `UnderlyingReader` and yields at most
  /// `remaining` bytes from it. After that, `read` reports 0 bytes without
  /// touching the underlying reader.
  fn LimitedReader(comptime UnderlyingReader: type) type {
      return struct {
          underlying_reader: UnderlyingReader,
          /// Number of bytes that may still be read; decremented by each read.
          remaining: usize,

          const Self = @This();
          pub const Error = UnderlyingReader.Error;
          pub const Reader = std.io.Reader(*Self, Error, read);

          /// Reads into at most the first `remaining` bytes of `buffer` and
          /// returns how many bytes the underlying reader produced.
          fn read(self: *Self, buffer: []u8) Error!usize {
              const capped = buffer[0..@min(buffer.len, self.remaining)];
              if (capped.len == 0) return 0;
              const got = try self.underlying_reader.read(capped);
              self.remaining -= got;
              return got;
          }

          /// Returns the `std.io` reader interface backed by this limiter.
          pub fn reader(self: *Self) Reader {
              return .{ .context = self };
          }
      };
  }
  /// Wraps `reader` in a `LimitedReader` that yields at most `limit` bytes.
  fn limitedReader(reader: anytype, limit: usize) LimitedReader(@TypeOf(reader)) {
      const Limited = LimitedReader(@TypeOf(reader));
      return Limited{
          .underlying_reader = reader,
          .remaining = limit,
      };
  }
  /// Streams one entry's data from `reader` to `writer`, decoding it according
  /// to `method`, and returns the actual CRC-32 of the bytes written. The
  /// caller should validate that value against the expected entry.crc32.
  ///
  /// `reader` must yield exactly the entry's compressed bytes and then report
  /// end of stream (a 0-length read).
  /// `writer` can be anything with a `writeAll(self: *Self, chunk: []const u8) anyerror!void` method.
  ///
  /// `uncompressed_size` is enforced only for the deflate methods; the `.store`
  /// path copies whatever `reader` yields without checking it.
  ///
  /// Errors (besides those of `reader`, `writer` and the decompressor):
  /// - `error.ZipUncompressSizeMismatch` if the decompressed length differs from
  ///   `uncompressed_size`. This is reported as soon as the output would exceed
  ///   the declared size, so an oversized stream is not written out in full.
  /// - `error.ZipDeflateTruncated` if buffered input is left over after the
  ///   deflate stream ends.
  /// - `error.UnsupportedCompressionMethod` for any other method.
  pub fn decompress(
      method: CompressionMethod,
      uncompressed_size: u32,
      reader: anytype,
      writer: anytype,
  ) !u32 {
      var hash = std.hash.Crc32.init();
      switch (method) {
          .store => {
              var buf: [std.mem.page_size]u8 = undefined;
              while (true) {
                  const len = try reader.read(&buf);
                  if (len == 0) break;
                  try writer.writeAll(buf[0..len]);
                  hash.update(buf[0..len]);
              }
          },
          // NOTE(review): `.deflate64` goes through the same `std.compress.flate`
          // decompressor as `.deflate`; confirm that this is intended.
          .deflate, .deflate64 => {
              var br = std.io.bufferedReader(reader);
              // Counted in 64 bits: a hostile stream can inflate to more than
              // `maxInt(u32)` bytes, which would overflow a u32 counter.
              var total_uncompressed: u64 = 0;
              var decompressor = std.compress.flate.decompressor(br.reader());
              while (try decompressor.next()) |chunk| {
                  total_uncompressed += @intCast(chunk.len);
                  // Stop before writing data beyond the declared size.
                  if (total_uncompressed > uncompressed_size)
                      return error.ZipUncompressSizeMismatch;
                  try writer.writeAll(chunk);
                  hash.update(chunk);
              }
              // Bytes still sitting in the read buffer were not consumed by the
              // decompressor.
              if (br.end != br.start)
                  return error.ZipDeflateTruncated;
              if (total_uncompressed != uncompressed_size)
                  return error.ZipUncompressSizeMismatch;
          },
          _ => return error.UnsupportedCompressionMethod,
      }
      return hash.final();
  }
  /// Iterates over the central directory entries of a single-disk ZIP archive.
  ///
  /// Create it with `init`, then call `next` until it returns `null`. Each
  /// returned `Entry` can be written to disk with `Entry.extract`. The iterator
  /// seeks within `file`, so the file position is not preserved across calls.
  pub const Iterator = struct {
      file: std.fs.File,
      eocdr: EndOfCentralDirectoryRecord,
      /// Number of central directory headers already returned by `next`.
      next_central_header_index: u16,
      /// Offset of the next header, relative to `eocdr.central_directory_offset`.
      next_central_header_offset: u64,

      /// Locates and decodes the end of central directory record of `file`.
      /// Returns `error.ZipUnsupportedMultiDisk` for multi-disk archives, plus any
      /// error from `findEocdr`.
      pub fn init(file: std.fs.File) !Iterator {
          const eocdr = blk: {
              const eocdr_bytes = try findEocdr(file);
              break :blk EndOfCentralDirectoryRecord.read(eocdr_bytes);
          };
          // Don't support multi-disk archives.
          if (eocdr.disk_number != 0 or
              eocdr.central_directory_disk_number != 0 or
              eocdr.record_count_disk != eocdr.record_count_total)
          {
              return error.ZipUnsupportedMultiDisk;
          }
          return .{
              .file = file,
              .eocdr = eocdr,
              .next_central_header_offset = 0,
              .next_central_header_index = 0,
          };
      }

      /// Reads the next central directory file header, or returns `null` once
      /// `eocdr.record_count_total` headers have been returned.
      ///
      /// Errors: `error.ZipTruncated` on a short read, `error.ZipHeader` on a bad
      /// signature, `error.ZipUnsupportedMultiDisk` if the entry is on another disk.
      pub fn next(self: *Iterator) !?Entry {
          if (self.next_central_header_index >= self.eocdr.record_count_total) {
              return null;
          }
          const header_file_offset: u64 = @as(u64, self.eocdr.central_directory_offset) + self.next_central_header_offset;
          const header = blk: {
              try self.file.seekTo(header_file_offset);
              var header: [46]u8 = undefined;
              const len = try self.file.readAll(&header);
              if (len != header.len)
                  return error.ZipTruncated;
              break :blk CentralDirectoryFileHeader.deserialize(header);
          };
          if (!std.mem.eql(u8, &header.signature, &central_file_header_sig))
              return error.ZipHeader;
          self.next_central_header_index += 1;
          // Widen before summing: the three lengths are u16 values taken from
          // the archive, and adding them in u16 could overflow.
          self.next_central_header_offset += @as(u64, 46) +
              @as(u64, header.filename_len) +
              @as(u64, header.extra_len) +
              @as(u64, header.comment_len);
          if (header.disk_number != 0)
              return error.ZipUnsupportedMultiDisk;
          return .{
              .header_file_offset = header_file_offset,
              .header = header,
          };
      }

      /// One central directory entry together with its position in the archive.
      pub const Entry = struct {
          /// Absolute file offset of this entry's central directory header.
          header_file_offset: u64,
          header: CentralDirectoryFileHeader,

          /// Reports whether `filename` must not be extracted. A name is bad if
          /// it is empty, starts with '/', or contains a ".." path component.
          /// Components are split on both '/' and '\\' so that backslash
          /// separators cannot hide a ".." on Windows.
          /// NOTE: other absolute Windows forms (drive letters, a leading
          /// backslash) are not detected here.
          fn isBadFilename(filename: []const u8) bool {
              if (filename.len == 0 or filename[0] == '/')
                  return true;
              var components = std.mem.splitAny(u8, filename, "/\\");
              while (components.next()) |component| {
                  if (std.mem.eql(u8, component, ".."))
                      return true;
              }
              return false;
          }

          /// Extracts this entry from `zip_file` into the directory `dest` and
          /// returns the CRC-32 of the extracted data. The caller should compare
          /// it against `header.crc32`.
          ///
          /// `filename_buf` is scratch space for the entry's filename and must be
          /// at least `header.filename_len` bytes long. Names ending in '/' are
          /// created as directories. Anything else is created as a new file
          /// (exclusively), with parent directories made as needed.
          ///
          /// Errors include:
          /// - `error.ZipInsufficientBuffer` if `filename_buf` is too small.
          /// - `error.ZipTruncated` on a short read.
          /// - `error.ZipHeader` on a bad local header signature.
          /// - `error.ZipRedundancyFail` if the local header disagrees with the
          ///   central header.
          /// - `error.ZipBadFilename` for a name rejected by `isBadFilename`.
          /// - `error.ZipBadDirectorySize` for a directory entry with nonzero size.
          /// - `error.ZipDecompressTruncated` if not all compressed bytes were
          ///   consumed.
          /// - any error from `decompress` or the file system.
          pub fn extract(self: Entry, zip_file: std.fs.File, filename_buf: []u8, dest: std.fs.Dir) !u32 {
              if (filename_buf.len < self.header.filename_len)
                  return error.ZipInsufficientBuffer;
              const filename = filename_buf[0..self.header.filename_len];
              // The filename immediately follows the 46-byte fixed header.
              try zip_file.seekTo(self.header_file_offset + 46);
              {
                  const len = try zip_file.readAll(filename);
                  if (len != filename.len)
                      return error.ZipTruncated;
              }
              // Number of variable-length bytes between the fixed local header
              // and the entry data.
              const local_data_header_offset: u64 = local_data_header_offset: {
                  const local_header = blk: {
                      try zip_file.seekTo(self.header.local_file_header_offset);
                      var local_header: [30]u8 = undefined;
                      const len = try zip_file.readAll(&local_header);
                      if (len != local_header.len)
                          return error.ZipTruncated;
                      break :blk LocalFileHeader.deserialize(local_header);
                  };
                  if (!std.mem.eql(u8, &local_header.signature, &local_file_header_sig))
                      return error.ZipHeader;
                  // TODO: verify minimum_version
                  // TODO: verify flags
                  // TODO: verify compression method
                  // TODO: verify last_mod_time
                  // TODO: verify last_mod_date
                  // TODO: verify filename_len and filename?
                  // TODO: extra?
                  // A zero in the local header is accepted without comparison.
                  if (local_header.crc32 != 0 and local_header.crc32 != self.header.crc32)
                      return error.ZipRedundancyFail;
                  if (local_header.compressed_size != 0 and
                      local_header.compressed_size != self.header.compressed_size)
                      return error.ZipRedundancyFail;
                  if (local_header.uncompressed_size != 0 and
                      local_header.uncompressed_size != self.header.uncompressed_size)
                      return error.ZipRedundancyFail;
                  break :local_data_header_offset @as(u64, local_header.filename_len) +
                      @as(u64, local_header.extra_len);
              };
              // The name comes from the archive, so refuse anything that could
              // resolve outside `dest`.
              if (isBadFilename(filename))
                  return error.ZipBadFilename;
              // All entries that end in '/' are directories
              if (filename[filename.len - 1] == '/') {
                  if (self.header.uncompressed_size != 0)
                      return error.ZipBadDirectorySize;
                  try dest.makePath(filename[0 .. filename.len - 1]);
                  // A directory has no data, so report the CRC-32 of empty input.
                  return std.hash.Crc32.hash(&.{});
              }
              const out_file = blk: {
                  if (std.fs.path.dirname(filename)) |dirname| {
                      var parent_dir = try dest.makeOpenPath(dirname, .{});
                      defer parent_dir.close();
                      const basename = std.fs.path.basename(filename);
                      break :blk try parent_dir.createFile(basename, .{ .exclusive = true });
                  }
                  break :blk try dest.createFile(filename, .{ .exclusive = true });
              };
              defer out_file.close();
              const local_data_file_offset: u64 =
                  @as(u64, self.header.local_file_header_offset) +
                  @as(u64, 30) +
                  local_data_header_offset;
              try zip_file.seekTo(local_data_file_offset);
              // Cap reads at the entry's compressed size so `decompress` cannot
              // run into whatever follows the entry.
              var limited_reader = limitedReader(zip_file.reader(), self.header.compressed_size);
              const crc = try decompress(
                  self.header.compression_method,
                  self.header.uncompressed_size,
                  limited_reader.reader(),
                  out_file.writer(),
              );
              if (limited_reader.remaining != 0)
                  return error.ZipDecompressTruncated;
              return crc;
          }
      };
  };
  /// Extracts every entry of the ZIP archive `file` into the directory `dest`.
  /// Returns `error.ZipCrcMismatch` if an extracted entry's CRC-32 differs from
  /// the value in its central directory header. Errors from `Iterator` and
  /// `Entry.extract` are propagated. Extraction stops at the first error.
  pub fn pipeToFileSystem(dest: std.fs.Dir, file: std.fs.File) !void {
      var name_storage: [std.fs.MAX_PATH_BYTES]u8 = undefined;
      var entries = try Iterator.init(file);
      while (true) {
          const entry = (try entries.next()) orelse break;
          const actual_crc = try entry.extract(file, &name_storage, dest);
          if (actual_crc != entry.header.crc32) return error.ZipCrcMismatch;
      }
  }
  /// Test helper: runs `testZipWithCache` on `files`, using a stack-allocated
  /// cache with one slot per file.
  fn testZip(comptime files: []const File) !void {
      var cache: [files.len]FileCache = undefined;
      return testZipWithCache(files, cache[0..]);
  }
  /// Test helper: writes `files` into an archive named "zip" inside a temporary
  /// directory and extracts it into that same directory. It then checks that
  /// each extracted file has the expected content.
  fn testZipWithCache(files: []const File, cache: []FileCache) !void {
      var tmp = testing.tmpDir(.{ .no_follow = true });
      defer tmp.cleanup();

      // Build the archive; close it before reopening for reading.
      {
          var new_archive = try tmp.dir.createFile("zip", .{});
          defer new_archive.close();
          try writeFile(new_archive, files, cache);
      }

      var archive = try tmp.dir.openFile("zip", .{});
      defer archive.close();
      try pipeToFileSystem(tmp.dir, archive);

      for (files) |expected| {
          var extracted = try tmp.dir.openFile(expected.name, .{});
          defer extracted.close();
          var contents: [4096]u8 = undefined;
          const contents_len = try extracted.reader().readAll(&contents);
          try testing.expectEqualStrings(expected.content, contents[0..contents_len]);
      }
  }
  // Round-trips an archive holding a single stored file.
  test "zip one file" {
      const files = [_]File{
          .{ .name = "onefile.txt", .content = "Just a single file\n", .compression = .store },
      };
      try testZip(&files);
  }
  // Round-trips several stored files, including ones in nested subdirectories.
  test "zip multiple files" {
      const files = [_]File{
          .{ .name = "foo", .content = "a foo file\n", .compression = .store },
          .{ .name = "subdir/bar", .content = "bar is this right?\nanother newline\n", .compression = .store },
          .{ .name = "subdir/another/baz", .content = "bazzy mc bazzerson", .compression = .store },
      };
      try testZip(&files);
  }
  // Round-trips an archive that mixes deflate, deflate64 and stored entries.
  test "zip deflated" {
      const files = [_]File{
          .{ .name = "deflateme", .content = "This is a deflated file.\nIt should be smaller in the Zip file1\n", .compression = .deflate },
          .{ .name = "deflateme64", .content = "The 64k version of deflate!\n", .compression = .deflate64 },
          .{ .name = "raw", .content = "Not all files need to be deflated in the same Zip.\n", .compression = .store },
      };
      try testZip(&files);
  }
  435. }