aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/sisudoc/io_in/read_zip_pod.d
diff options
context:
space:
mode:
authorRalph Amissah <ralph.amissah@gmail.com>2026-04-13 15:33:07 -0400
committerRalph Amissah <ralph.amissah@gmail.com>2026-04-13 16:25:56 -0400
commitd0ac448e6425c9e4246cd529aeb11643dce8093f (patch)
treec12356fbc55cffd495cc37b81ca6fb125e3be195 /src/sisudoc/io_in/read_zip_pod.d
parentpackage.nix cosmetic line-breaks for build command (diff)
spine may be run against a document-markup zip pod
- claude contributed src - Opens the zip with std.zip.ZipArchive (reads the whole file into memory) - Locates pod.manifest inside the archive to discover document paths and languages - Extracts markup files (.sst/.ssm/.ssi) as in-memory strings - Extracts images as in-memory byte arrays - Extracts conf/dr_document_make if present - Presents these to the existing pipeline as if they were read from the filesystem - Some security mitigations: - Zip Slip / Path Traversal: Reject entries containing `..` or starting with `/`; canonicalize resolved paths and verify they fall within extraction root - Zip Bomb: Check `ArchiveMember.size` before extracting; enforce per-file (50MB) and total size limits (500MB) - Entry Count: Limit number of entries (a pod should have at most ~100 files) - Path depth: limit (Maximum 10 path components). - Symlinks: Verify no symlinks in extracted content before processing (post-extraction recursive scan) - Filename Validation: Only allow expected characters; reject null bytes - Malformed Zips: Catch `ZipException` from `std.zip.ZipArchive` constructor - Cleanup on error
Diffstat (limited to 'src/sisudoc/io_in/read_zip_pod.d')
-rw-r--r--src/sisudoc/io_in/read_zip_pod.d279
1 files changed, 279 insertions, 0 deletions
diff --git a/src/sisudoc/io_in/read_zip_pod.d b/src/sisudoc/io_in/read_zip_pod.d
new file mode 100644
index 0000000..38480cd
--- /dev/null
+++ b/src/sisudoc/io_in/read_zip_pod.d
@@ -0,0 +1,279 @@
+/+
+- Name: SisuDoc Spine, Doc Reform [a part of]
+ - Description: documents, structuring, processing, publishing, search
+ - static content generator
+
+ - Author: Ralph Amissah
+ [ralph.amissah@gmail.com]
+
+ - Copyright: (C) 2015 (continuously updated, current 2026) Ralph Amissah, All Rights Reserved.
+
+ - License: AGPL 3 or later:
+
+ Spine (SiSU), a framework for document structuring, publishing and
+ search
+
+ Copyright (C) Ralph Amissah
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU AFFERO General Public License as published by the
+ Free Software Foundation, either version 3 of the License, or (at your
+ option) any later version.
+
+ This program is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ You should have received a copy of the GNU General Public License along with
+ this program. If not, see [https://www.gnu.org/licenses/].
+
+ If you have Internet connection, the latest version of the AGPL should be
+ available at these locations:
+ [https://www.fsf.org/licensing/licenses/agpl.html]
+ [https://www.gnu.org/licenses/agpl.html]
+
+ - Spine (by Doc Reform, related to SiSU) uses standard:
+ - docReform markup syntax
+ - standard SiSU markup syntax with modified headers and minor modifications
+ - docReform object numbering
+ - standard SiSU object citation numbering & system
+
+ - Homepages:
+ [https://www.sisudoc.org]
+ [https://www.doc-reform.org]
+
+ - Git
+ [https://git.sisudoc.org/]
+
++/
+/++
+ module read_zip_pod;<BR>
+ - extract pod zip archives to temp directory for processing<BR>
+ - validate zip entries for security (path traversal, size limits)
++/
+module sisudoc.io_in.read_zip_pod;
+@safe:
+template spineExtractZipPod() {
+ import std.algorithm : canFind;
+ import std.array : array;
+ import std.conv : to;
+ import std.file;
+ import std.path;
+ import std.regex;
+ import std.stdio;
+ import std.string : indexOf;
+
+ /+ security limits applied to a zip pod before and during extraction +/
+ enum : size_t {
+   MAX_ENTRY_SIZE  = 50 * 1024 * 1024,  /+ largest single entry accepted: 50 MB +/
+   MAX_TOTAL_SIZE  = 500 * 1024 * 1024, /+ largest combined size accepted: 500 MB +/
+   MAX_ENTRY_COUNT = 500,               /+ most entries an archive may hold +/
+   MAX_PATH_DEPTH  = 10,                /+ most path components in an entry name +/
+ }
+
+ /+ whitelist for entry names: ASCII letters and digits, dot, underscore, forward slash, space and dash; anything else is rejected +/
+ static auto rgx_safe_entry_name = ctRegex!(`^[a-zA-Z0-9._/ -]+$`);
+
+ /+ outcome of a zip pod extraction attempt; error_msg is only meaningful when ok is false +/
+ struct ZipPodResult {
+   string tmp_dir,          /+ temp directory that holds the extracted pod +/
+          pod_dir;          /+ the pod directory itself, located inside tmp_dir +/
+   bool ok = false;         /+ true only when extraction completed +/
+   string error_msg;        /+ what went wrong, when ok is false +/
+ }
+
+ /+ ↓ validate a single zip entry name for security
+   Params:  name = entry name exactly as stored in the archive directory
+   Returns: "" when the name is acceptable, otherwise a message naming the
+            first rule the name violates
+   The ".." test is a plain substring match, so it is deliberately
+   conservative: a name such as "a..b" is rejected as well.
+ +/
+ string validateEntryName(string name) {
+   /+ reject empty names +/
+   if (name.length == 0)
+     return "empty entry name";
+   /+ reject absolute paths +/
+   if (name[0] == '/')
+     return "absolute path in zip entry: " ~ name;
+   /+ reject path traversal +/
+   if (name.canFind(".."))
+     return "path traversal in zip entry: " ~ name;
+   /+ reject null bytes +/
+   if (name.indexOf('\0') >= 0)
+     return "null byte in zip entry name: " ~ name;
+   /+ reject backslashes (windows path separator tricks) +/
+   if (name.canFind("\\"))
+     return "backslash in zip entry: " ~ name;
+   /+ reject empty path components: once a leading directory is stripped from "dir//x" the remainder "/x" is an absolute path +/
+   if (name.canFind("//"))
+     return "empty path component in zip entry: " ~ name;
+   /+ check path depth: MAX_PATH_DEPTH limits path components, not separators; with no leading or doubled '/', components = separators + 1, except that the trailing '/' of a directory entry adds none +/
+   size_t components = 0;
+   foreach (c; name) {
+     if (c == '/') components++;
+   }
+   if (name[$-1] != '/')
+     components++;
+   if (components > MAX_PATH_DEPTH)
+     return "path too deep in zip entry: " ~ name;
+   /+ check allowed characters +/
+   if (!(name.matchFirst(rgx_safe_entry_name)))
+     return "disallowed characters in zip entry: " ~ name;
+   return ""; /+ empty string means valid +/
+ }
+
+ /+ ↓ extract zip pod to temp directory, returns ZipPodResult
+   - reads the whole archive into memory and validates every entry (name,
+     declared size, entry count) before anything is written to disk
+   - extracts into tempDir/spine-zip-pod/[zip basename]/, first removing any
+     directory already at that path; a leading "pod/" on an entry name is
+     stripped so those files land directly under the pod root
+   - on success ok is true, pod_dir is the extracted pod root and tmp_dir its
+     parent; on failure ok is false, error_msg says why, and anything already
+     extracted is removed (best effort)
+   NOTE(review): the extraction root is a fixed, predictable path under
+   tempDir; on a shared machine another user could pre-create it - consider a
+   per-run unique directory
+ +/
+ @trusted ZipPodResult extractZipPod(string zip_path) {
+   import std.zip;
+   ZipPodResult result;
+   result.ok = false;
+   /+ ↓ verify zip file exists +/
+   if (!exists(zip_path) || !zip_path.isFile) {
+     result.error_msg = "zip file not found: " ~ zip_path;
+     return result;
+   }
+   /+ ↓ derive pod name from zip filename +/
+   string zip_basename = zip_path.baseName.stripExtension;
+   /+ a filename such as "..zip" strips to "."; used as a directory name that would resolve to the shared temp base (or its parent) and the rmdirRecurse below would delete it +/
+   if (zip_basename.length == 0 || zip_basename == "." || zip_basename == "..") {
+     result.error_msg = "cannot derive pod name from zip filename: " ~ zip_path;
+     return result;
+   }
+   /+ ↓ read and parse zip archive +/
+   ZipArchive zip;
+   try {
+     zip = new ZipArchive(read(zip_path));
+   } catch (ZipException ex) {
+     result.error_msg = "failed to read zip archive: " ~ zip_path ~ " - " ~ ex.msg;
+     return result;
+   } catch (Exception ex) {
+     result.error_msg = "error reading zip file: " ~ zip_path ~ " - " ~ ex.msg;
+     return result;
+   }
+   /+ ↓ validate entry count +/
+   if (zip.directory.length > MAX_ENTRY_COUNT) {
+     result.error_msg = "zip archive has too many entries ("
+       ~ zip.directory.length.to!string ~ " > " ~ MAX_ENTRY_COUNT.to!string ~ "): " ~ zip_path;
+     return result;
+   }
+   /+ ↓ validate all entries before extracting any; sizes here are the ones declared in the archive directory +/
+   size_t total_size = 0;
+   foreach (name, member; zip.directory) {
+     /+ validate entry name +/
+     string name_err = validateEntryName(name);
+     if (name_err.length > 0) {
+       result.error_msg = name_err;
+       return result;
+     }
+     /+ check per-entry size +/
+     if (member.expandedSize > MAX_ENTRY_SIZE) {
+       result.error_msg = "zip entry too large ("
+         ~ member.expandedSize.to!string ~ " bytes): " ~ name;
+       return result;
+     }
+     /+ check total size +/
+     total_size += member.expandedSize;
+     if (total_size > MAX_TOTAL_SIZE) {
+       result.error_msg = "zip archive total size exceeds limit ("
+         ~ MAX_TOTAL_SIZE.to!string ~ " bytes): " ~ zip_path;
+       return result;
+     }
+   }
+   /+ ↓ create temp directory +/
+   string tmp_base = tempDir.buildPath("spine-zip-pod");
+   try {
+     if (!exists(tmp_base))
+       mkdirRecurse(tmp_base);
+   } catch (FileException ex) {
+     result.error_msg = "failed to create temp base directory: " ~ ex.msg;
+     return result;
+   }
+   /+ pod directory inside temp: tmp_base/pod_name/ +/
+   string pod_dir = tmp_base.buildPath(zip_basename);
+   try {
+     if (exists(pod_dir))
+       rmdirRecurse(pod_dir);
+     mkdirRecurse(pod_dir);
+   } catch (FileException ex) {
+     result.error_msg = "failed to create temp pod directory: " ~ ex.msg;
+     return result;
+   }
+   /+ ↓ shared failure path once pod_dir exists: record the message, remove what was extracted (best effort, a failed removal is reported on stderr rather than hidden) and hand back the failed result +/
+   ZipPodResult failAfterCleanup(string msg) {
+     result.error_msg = msg;
+     try {
+       rmdirRecurse(pod_dir);
+     } catch (FileException) {
+       stderr.writeln("WARNING: failed to clean up temp zip extraction: ", pod_dir);
+     }
+     return result;
+   }
+   /+ ↓ extract entries +/
+   /+ zip internal structure uses paths like:
+     pod.manifest, conf/dr_document_make,
+     pod/media/text/en/filename.sst, image/filename.png
+     but the extracted pod directory needs to look like a normal pod:
+     pod.manifest, conf/dr_document_make,
+     media/text/en/filename.sst, image/filename.png
+     The "pod/" prefix in zip entries for text files maps to the pod root.
+   +/
+   /+ ↓ pre-compute canonical pod path for containment checks +/
+   auto canonical_pod = (pod_dir.asNormalizedPath).array.to!string ~ "/";
+   /+ ↓ bytes actually produced by decompression so far; the declared sizes checked above come from the archive and cannot be trusted on their own +/
+   size_t extracted_total = 0;
+   foreach (name, member; zip.directory) {
+     /+ skip directory entries +/
+     if (name.length > 0 && name[$-1] == '/')
+       continue;
+     /+ ↓ map zip internal path to filesystem path +/
+     /+ entries with "pod/" prefix: strip it so media/text/en/file.sst ends up at pod_dir/media/text/en/file.sst +/
+     string entry_path = name;
+     if (entry_path.length > 4 && entry_path[0..4] == "pod/") {
+       entry_path = entry_path[4..$];
+     }
+     string out_path = pod_dir.buildPath(entry_path);
+     /+ ↓ verify resolved path is within pod_dir (defense in depth) +/
+     auto canonical_out = (out_path.asNormalizedPath).array.to!string;
+     if (canonical_out.length < canonical_pod.length
+       || canonical_out[0..canonical_pod.length] != canonical_pod)
+     {
+       return failAfterCleanup("zip entry escapes extraction directory: " ~ name);
+     }
+     /+ ↓ create parent directories +/
+     string parent = out_path.dirName;
+     try {
+       if (!exists(parent))
+         mkdirRecurse(parent);
+     } catch (FileException ex) {
+       return failAfterCleanup("failed to create directory for: " ~ name ~ " - " ~ ex.msg);
+     }
+     /+ ↓ decompress, re-check limits against the real size, then write file +/
+     try {
+       auto data = zip.expand(member);
+       if (data.length > MAX_ENTRY_SIZE) {
+         return failAfterCleanup("zip entry expands beyond size limit ("
+           ~ data.length.to!string ~ " bytes): " ~ name);
+       }
+       extracted_total += data.length;
+       if (extracted_total > MAX_TOTAL_SIZE) {
+         return failAfterCleanup("zip archive extracted size exceeds limit ("
+           ~ MAX_TOTAL_SIZE.to!string ~ " bytes): " ~ zip_path);
+       }
+       std.file.write(out_path, data);
+     } catch (Exception ex) {
+       return failAfterCleanup("failed to extract: " ~ name ~ " - " ~ ex.msg);
+     }
+   }
+   /+ ↓ verify no symlinks were created (defense in depth) +/
+   string symlink_err = checkForSymlinks(pod_dir);
+   if (symlink_err.length > 0) {
+     return failAfterCleanup(symlink_err);
+   }
+   /+ ↓ verify pod.manifest exists in extracted content +/
+   if (!exists(pod_dir.buildPath("pod.manifest"))) {
+     return failAfterCleanup("zip archive does not contain pod.manifest: " ~ zip_path);
+   }
+   result.tmp_dir = tmp_base;
+   result.pod_dir = pod_dir;
+   result.ok = true;
+   return result;
+ }
+
+ /+ ↓ recursively check for symlinks in extracted directory
+   Params:  dir_path = root of the directory tree to scan
+   Returns: "" when the tree holds no symlink, otherwise a message naming the
+            first symlink met, or describing the error that stopped the scan
+   The walk is told not to follow symlinks: a scan whose job is to detect
+   them must not descend through one into whatever it points at, which may
+   lie outside dir_path. The link itself is still visited and reported.
+ +/
+ @trusted string checkForSymlinks(string dir_path) {
+   try {
+     foreach (entry; dirEntries(dir_path, SpanMode.depth, false)) {
+       if (entry.isSymlink) {
+         return "symlink found in zip extraction: " ~ entry.name;
+       }
+     }
+   } catch (FileException ex) {
+     return "error checking for symlinks: " ~ ex.msg;
+   }
+   return "";
+ }
+
+ /+ ↓ remove the extracted pod directory and mark the result as no longer usable
+   Removal is best effort: a failure is reported on stderr and otherwise
+   ignored. zpr.ok is set to false whether or not anything was removed.
+ +/
+ void cleanupZipPod(ref ZipPodResult zpr) {
+   string extracted = zpr.pod_dir;
+   bool present = extracted.length > 0 && exists(extracted);
+   if (present) {
+     try {
+       rmdirRecurse(extracted);
+     } catch (FileException) {
+       stderr.writeln("WARNING: failed to clean up temp zip extraction: ", extracted);
+     }
+   }
+   zpr.ok = false;
+ }
+}