aboutsummaryrefslogtreecommitdiffhomepage
path: root/org/in_zip_pod.org
blob: f2903c427fda33c345b175820efe84420016b4db (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
-*- mode: org -*-
#+TITLE:       sisudoc spine (doc_reform) markup source zip pod
#+DESCRIPTION: documents - structuring, publishing in multiple formats & search
#+FILETAGS:    :spine:sourcefile:read:
#+AUTHOR:      Ralph Amissah
#+EMAIL:       [[mailto:ralph.amissah@gmail.com][ralph.amissah@gmail.com]]
#+COPYRIGHT:   Copyright (C) 2015 (continuously updated, current 2026) Ralph Amissah
#+LANGUAGE:    en
#+STARTUP:     content hideblocks hidestars noindent entitiespretty
#+PROPERTY:    header-args+ :eval never-export :exports code
#+PROPERTY:    header-args+ :noweb yes :padline no
#+PROPERTY:    header-args+ :results silent :cache no
#+PROPERTY:    header-args+ :mkdirp yes
#+OPTIONS:     H:3 num:nil toc:t \n:t ::t |:t ^:nil -:t f:t *:t
- magic single double-quote → " ← FIX changes highlighting behavior (occurring
  after it) in org document. INVESTIGATE (org-mode CONFIG?) FIND & FIX

- [[./doc-reform.org][doc-reform.org]]  [[./][org/]]

* read zip

#+HEADER: :tangle "../src/sisudoc/io_in/read_zip_pod.d"
#+HEADER: :noweb yes
#+BEGIN_SRC d
<<doc_header_including_copyright_and_license>>
/++
  module read_zip_pod;<BR>
  - extract pod zip archives to temp directory for processing<BR>
  - validate zip entries for security (path traversal, size limits)
+/
module sisudoc.io_in.read_zip_pod;
@safe:
template spineExtractZipPod() {
  import std.algorithm : canFind;
  import std.array : array;
  import std.conv : to;
  import std.file;
  import std.path;
  import std.regex;
  import std.stdio;
  import std.string : indexOf;

  /+ security limits for zip extraction, compared against the sizes and
     names recorded in the archive directory +/
  enum size_t MAX_ENTRY_SIZE     = 50 * 1024 * 1024;   /+ 50 MB per entry (expanded size) +/
  enum size_t MAX_TOTAL_SIZE     = 500 * 1024 * 1024;  /+ 500 MB total (sum of expanded sizes) +/
  enum size_t MAX_ENTRY_COUNT    = 500;                 /+ max entries in archive +/
  enum size_t MAX_PATH_DEPTH     = 10;                  /+ max '/' separators in an entry name +/

  /+ allowed entry name pattern: ASCII letters, digits, dots, underscores,
     forward slashes, spaces and dashes; a name with any other character
     does not match +/
  static auto rgx_safe_entry_name = ctRegex!(`^[a-zA-Z0-9._/ -]+$`);

  /+ outcome of extracting a zip pod; tmp_dir and pod_dir are only
     filled in when ok is true +/
  struct ZipPodResult {
    string tmp_dir;          /+ temp directory containing extracted pod +/
    string pod_dir;          /+ path to pod directory within tmp_dir +/
    bool   ok;               /+ extraction succeeded +/
    string error_msg;        /+ error description if !ok +/
  }

  /+ ↓ security screening of one zip entry name
     - name: entry name exactly as stored in the archive directory
     - returns: "" when the name is acceptable, otherwise a message for the
       first rule it breaks (rules are tried in the order written below)
  +/
  string validateEntryName(string name) {
    string problem = "";  /+ stays empty while no rule is violated +/
    if (name.length == 0) {
      /+ an entry must carry a name +/
      problem = "empty entry name";
    } else if (name[0] == '/') {
      /+ a leading slash addresses the filesystem root +/
      problem = "absolute path in zip entry: " ~ name;
    } else if (name.canFind("..")) {
      /+ any ".." sequence is refused, wherever it sits in the name +/
      problem = "path traversal in zip entry: " ~ name;
    } else if (name.indexOf('\0') != -1) {
      /+ embedded NUL +/
      problem = "null byte in zip entry name: " ~ name;
    } else if (name.canFind("\\")) {
      /+ backslash, the windows path separator +/
      problem = "backslash in zip entry: " ~ name;
    } else {
      /+ nesting is measured as the number of '/' separators +/
      size_t separators = 0;
      for (size_t i = 0; i < name.length; ++i) {
        if (name[i] == '/')
          ++separators;
      }
      if (separators > MAX_PATH_DEPTH) {
        problem = "path too deep in zip entry: " ~ name;
      } else if (!(name.matchFirst(rgx_safe_entry_name))) {
        /+ finally the whole name must consist of whitelisted characters +/
        problem = "disallowed characters in zip entry: " ~ name;
      }
    }
    return problem;
  }

  /+ ↓ extract zip pod to temp directory, returns ZipPodResult
     - zip_path: path of a local zip archive
     - returns: on success ok == true with tmp_dir and pod_dir set;
       on any failure ok == false and error_msg describes the problem
     - the entry count, every entry name and the declared sizes are validated
       before anything is written; the sizes are checked again against the
       bytes actually decompressed
     - once the pod directory has been created, every failure path removes
       it again (best effort)
  +/
  @trusted ZipPodResult extractZipPod(string zip_path) {
    import std.zip;
    ZipPodResult result;
    result.ok = false;
    /+ ↓ verify zip file exists +/
    if (!exists(zip_path) || !zip_path.isFile) {
      result.error_msg = "zip file not found: " ~ zip_path;
      return result;
    }
    /+ ↓ derive pod name from zip filename +/
    string zip_basename = zip_path.baseName.stripExtension;
    /+ ↓ refuse names that do not yield a sub-directory of the temp base:
       a file called "..zip" or "...zip" strips to "." or "..", and the
       rmdirRecurse() of an already existing pod directory further down
       would then act on the temp base itself or on its parent +/
    if (zip_basename.length == 0 || zip_basename == "." || zip_basename == "..") {
      result.error_msg = "cannot derive pod directory name from zip filename: " ~ zip_path;
      return result;
    }
    /+ ↓ read and parse zip archive +/
    ZipArchive zip;
    try {
      zip = new ZipArchive(read(zip_path));
    } catch (ZipException ex) {
      result.error_msg = "failed to read zip archive: " ~ zip_path ~ " - " ~ ex.msg;
      return result;
    } catch (Exception ex) {
      result.error_msg = "error reading zip file: " ~ zip_path ~ " - " ~ ex.msg;
      return result;
    }
    /+ ↓ validate entry count +/
    if (zip.directory.length > MAX_ENTRY_COUNT) {
      result.error_msg = "zip archive has too many entries ("
        ~ zip.directory.length.to!string ~ " > " ~ MAX_ENTRY_COUNT.to!string ~ "): " ~ zip_path;
      return result;
    }
    /+ ↓ validate all entries before extracting any +/
    size_t total_size = 0;
    foreach (name, member; zip.directory) {
      /+ validate entry name +/
      string name_err = validateEntryName(name);
      if (name_err.length > 0) {
        result.error_msg = name_err;
        return result;
      }
      /+ check per-entry size +/
      if (member.expandedSize > MAX_ENTRY_SIZE) {
        result.error_msg = "zip entry too large ("
          ~ member.expandedSize.to!string ~ " bytes): " ~ name;
        return result;
      }
      /+ check total size +/
      total_size += member.expandedSize;
      if (total_size > MAX_TOTAL_SIZE) {
        result.error_msg = "zip archive total size exceeds limit ("
          ~ MAX_TOTAL_SIZE.to!string ~ " bytes): " ~ zip_path;
        return result;
      }
    }
    /+ ↓ create temp directory +/
    string tmp_base = tempDir.buildPath("spine-zip-pod");
    try {
      if (!exists(tmp_base))
        mkdirRecurse(tmp_base);
    } catch (FileException ex) {
      result.error_msg = "failed to create temp base directory: " ~ ex.msg;
      return result;
    }
    /+ pod directory inside temp: tmp_base/pod_name/
       a directory left over from an earlier run is replaced +/
    string pod_dir = tmp_base.buildPath(zip_basename);
    try {
      if (exists(pod_dir))
        rmdirRecurse(pod_dir);
      mkdirRecurse(pod_dir);
    } catch (FileException ex) {
      result.error_msg = "failed to create temp pod directory: " ~ ex.msg;
      return result;
    }
    /+ ↓ extract entries +/
    /+ zip internal structure uses paths like:
       pod.manifest, conf/dr_document_make,
       pod/media/text/en/filename.sst, image/filename.png
       but the extracted pod directory needs to look like a normal pod:
       pod.manifest, conf/dr_document_make,
       media/text/en/filename.sst, image/filename.png
       The "pod/" prefix in zip entries for text files maps to the pod root.
    +/
    /+ ↓ pre-compute canonical pod path for containment checks +/
    auto canonical_pod = (pod_dir.asNormalizedPath).array.to!string ~ "/";
    /+ ↓ running total of the bytes really decompressed +/
    size_t extracted_total = 0;
    foreach (name, member; zip.directory) {
      /+ skip directory entries +/
      if (name.length > 0 && name[$-1] == '/')
        continue;
      /+ ↓ map zip internal path to filesystem path +/
      /+ entries with "pod/" prefix: strip it so media/text/en/file.sst ends up at pod_dir/media/text/en/file.sst +/
      string entry_path = name;
      if (entry_path.length > 4 && entry_path[0..4] == "pod/") {
        entry_path = entry_path[4..$];
      }
      string out_path = pod_dir.buildPath(entry_path);
      /+ ↓ verify resolved path is within pod_dir (defense in depth) +/
      auto canonical_out = (out_path.asNormalizedPath).array.to!string;
      if (canonical_out.length < canonical_pod.length
        || canonical_out[0..canonical_pod.length] != canonical_pod)
      {
        result.error_msg = "zip entry escapes extraction directory: " ~ name;
        try { rmdirRecurse(pod_dir); } catch (FileException) {}
        return result;
      }
      /+ ↓ create parent directories +/
      string parent = out_path.dirName;
      try {
        if (!exists(parent))
          mkdirRecurse(parent);
      } catch (FileException ex) {
        result.error_msg = "failed to create directory for: " ~ name ~ " - " ~ ex.msg;
        try { rmdirRecurse(pod_dir); } catch (FileException) {}
        return result;
      }
      /+ ↓ decompress and write file +/
      try {
        auto data = zip.expand(member);
        /+ ↓ the sizes validated above are the ones declared in the archive
           headers; enforce the same limits on what was actually produced,
           before anything reaches the disk +/
        if (data.length > MAX_ENTRY_SIZE) {
          result.error_msg = "zip entry too large ("
            ~ data.length.to!string ~ " bytes): " ~ name;
          try { rmdirRecurse(pod_dir); } catch (FileException) {}
          return result;
        }
        extracted_total += data.length;
        if (extracted_total > MAX_TOTAL_SIZE) {
          result.error_msg = "zip archive total size exceeds limit ("
            ~ MAX_TOTAL_SIZE.to!string ~ " bytes): " ~ zip_path;
          try { rmdirRecurse(pod_dir); } catch (FileException) {}
          return result;
        }
        std.file.write(out_path, data);
      } catch (Exception ex) {
        result.error_msg = "failed to extract: " ~ name ~ " - " ~ ex.msg;
        try { rmdirRecurse(pod_dir); } catch (FileException) {}
        return result;
      }
    }
    /+ ↓ verify no symlinks were created (defense in depth) +/
    string symlink_err = checkForSymlinks(pod_dir);
    if (symlink_err.length > 0) {
      result.error_msg = symlink_err;
      try { rmdirRecurse(pod_dir); } catch (FileException) {}
      return result;
    }
    /+ ↓ verify pod.manifest exists in extracted content +/
    if (!exists(pod_dir.buildPath("pod.manifest"))) {
      result.error_msg = "zip archive does not contain pod.manifest: " ~ zip_path;
      try { rmdirRecurse(pod_dir); } catch (FileException) {}
      return result;
    }
    result.tmp_dir = tmp_base;
    result.pod_dir = pod_dir;
    result.ok = true;
    return result;
  }

  /+ ↓ recursively check for symlinks in extracted directory
     - dir_path: root of the directory tree to inspect
     - returns: "" when no symlink is found, otherwise a message naming the
       first symlink met, or describing the error that stopped the scan
     the walk itself does not follow symlinks: following one would descend
     into the link's target, which need not lie inside dir_path
  +/
  @trusted string checkForSymlinks(string dir_path) {
    try {
      /+ third argument is followSymlink; it defaults to true +/
      foreach (entry; dirEntries(dir_path, SpanMode.depth, false)) {
        if (entry.isSymlink) {
          return "symlink found in zip extraction: " ~ entry.name;
        }
      }
    } catch (FileException ex) {
      return "error checking for symlinks: " ~ ex.msg;
    }
    return "";
  }

  /+ ↓ limits and URL pattern used when downloading a zip pod from a URL to a temp file +/
  enum size_t MAX_DOWNLOAD_SIZE  = 200 * 1024 * 1024;  /+ 200 MB download limit, passed to curl --max-filesize +/
  enum int    DOWNLOAD_TIMEOUT   = 120;                 /+ seconds, passed to curl --max-time +/

  /+ accepted URL form: http:// or https://, then one or more of ASCII
     letters, digits, '.', '_', ':', '/', '-', ending in ".zip";
     characters such as '?', '#' and '@' do not match +/
  static auto rgx_url_zip = ctRegex!(`^https?://[a-zA-Z0-9._:/-]+[.]zip$`);

  /+ outcome of a download attempt +/
  struct DownloadResult {
    string local_path;       /+ path to downloaded temp file +/
    bool   ok;               /+ download succeeded +/
    string error_msg;        /+ error description if !ok +/
  }

  /+ ↓ true when arg is longer than 8 characters and begins with
     "https://" or "http://" (the comparison is case-sensitive) +/
  bool isUrl(string arg) {
    if (arg.length <= 8)
      return false;
    string lead = arg[0..8];
    return lead == "https://" || lead[0..7] == "http://";
  }

  /+ ↓ true when host names the local machine or is an IPv4 literal in an
     unspecified, loopback, link-local or private range
     - host: host part of a URL, expected lower-cased
     - a name is only treated as an IPv4 literal when it consists of four
       dot-separated groups of 1-3 decimal digits, so hostnames that merely
       contain "10." or "192.168." are not affected
     - this is a purely lexical test: it does not resolve names
  +/
  bool isLocalOrPrivateHost(string host) {
    /+ a single trailing dot denotes the same host +/
    if (host.length > 0 && host[$-1] == '.')
      host = host[0..$-1];
    if (host == "localhost" || host == "::1" || host == "[::1]")
      return true;
    /+ ↓ parse as dotted quad; anything else is an ordinary hostname +/
    uint[4] octet;
    size_t group = 0;
    size_t digits = 0;
    foreach (c; host) {
      if (c == '.') {
        if (digits == 0 || group == 3)
          return false;
        group++;
        digits = 0;
      } else if (c >= '0' && c <= '9') {
        if (digits == 3)
          return false;
        octet[group] = octet[group] * 10 + (c - '0');
        digits++;
      } else {
        return false;
      }
    }
    if (group != 3 || digits == 0)
      return false;
    immutable a = octet[0];
    immutable b = octet[1];
    return a == 0                             /+ 0.0.0.0/8, includes 0.0.0.0 +/
      || a == 10                              /+ 10.0.0.0/8 +/
      || a == 127                             /+ 127.0.0.0/8 loopback +/
      || (a == 169 && b == 254)               /+ 169.254.0.0/16 link-local +/
      || (a == 172 && b >= 16 && b <= 31)     /+ 172.16.0.0/12 +/
      || (a == 192 && b == 168);              /+ 192.168.0.0/16 +/
  }

  /+ ↓ download a zip pod from a URL to a temp file using the curl executable
     - url: http or https URL matching rgx_url_zip
     - returns: DownloadResult with ok == true and local_path set on success;
       otherwise ok == false and error_msg describes the problem
     - the file is stored as <tempDir>/spine-zip-pod/<last path component of url>
     - NOTE(review): the local/private address filter only inspects the host
       text of the URL; redirects followed by curl and names that resolve to
       internal addresses are not examined here
  +/
  @trusted DownloadResult downloadZipUrl(string url) {
    import std.process : execute, environment;
    DownloadResult result;
    result.ok = false;
    /+ ↓ validate URL scheme +/
    if (url.length < 8 || (url[0..8] != "https://" && url[0..7] != "http://")) {
      result.error_msg = "only http/https URLs are supported: " ~ url;
      return result;
    }
    if (url[0..7] == "http://") {
      stderr.writeln("WARNING: downloading over insecure http: ", url);
    }
    /+ ↓ validate URL format +/
    if (!(url.matchFirst(rgx_url_zip))) {
      result.error_msg = "URL does not match expected zip URL pattern: " ~ url;
      return result;
    }
    /+ ↓ reject URLs that could target internal services +/
    {
      import std.uni : toLower;
      string url_lower = url.toLower;
      /+ strip scheme to get host portion +/
      string after_scheme = (url_lower[0..8] == "https://")
        ? url_lower[8..$]
        : url_lower[7..$];
      /+ extract host (up to first / or :) +/
      string host;
      foreach (i, c; after_scheme) {
        if (c == '/' || c == ':') {
          host = after_scheme[0..i];
          break;
        }
      }
      if (host.length == 0) host = after_scheme;
      if (isLocalOrPrivateHost(host)) {
        result.error_msg = "URL targets a local/private address: " ~ url;
        return result;
      }
    }
    /+ ↓ derive filename from URL +/
    string url_basename = url.baseName;
    if (url_basename.length == 0 || url_basename.indexOf('.') < 0) {
      result.error_msg = "cannot determine filename from URL: " ~ url;
      return result;
    }
    /+ ↓ create temp directory for download +/
    string tmp_base = tempDir.buildPath("spine-zip-pod");
    try {
      if (!exists(tmp_base))
        mkdirRecurse(tmp_base);
    } catch (FileException ex) {
      result.error_msg = "failed to create temp directory: " ~ ex.msg;
      return result;
    }
    string tmp_file = tmp_base.buildPath(url_basename);
    /+ ↓ download using curl (argument vector, no shell involved) +/
    auto curl_result = execute([
      "curl",
      "--silent", "--show-error",
      "--fail",                            /+ fail on HTTP errors +/
      "--location",                        /+ follow redirects +/
      "--max-redirs", "5",                 /+ limit redirects +/
      "--max-time", DOWNLOAD_TIMEOUT.to!string,
      "--max-filesize", MAX_DOWNLOAD_SIZE.to!string,
      "--proto", "=https,http",            /+ restrict protocols +/
      "--output", tmp_file,
      url
    ]);
    if (curl_result.status != 0) {
      result.error_msg = "download failed: " ~ url;
      if (curl_result.output.length > 0)
        result.error_msg ~= " - " ~ curl_result.output;
      /+ clean up partial download +/
      try { if (exists(tmp_file)) remove(tmp_file); } catch (FileException) {}
      return result;
    }
    if (!exists(tmp_file) || !tmp_file.isFile) {
      result.error_msg = "download produced no file: " ~ url;
      return result;
    }
    result.local_path = tmp_file;
    result.ok = true;
    return result;
  }

  /+ ↓ remove the downloaded temp file, when there is one, and mark the
     result as not ok; a failed removal is only reported on stderr +/
  void cleanupDownload(ref DownloadResult dlr) {
    immutable bool file_present = dlr.local_path.length > 0 && exists(dlr.local_path);
    if (file_present) {
      try {
        remove(dlr.local_path);
      } catch (FileException) {
        stderr.writeln("WARNING: failed to clean up downloaded file: ", dlr.local_path);
      }
    }
    dlr.ok = false;
  }

  /+ ↓ remove the extracted pod directory, when there is one, and mark the
     result as not ok; a failed removal is only reported on stderr +/
  void cleanupZipPod(ref ZipPodResult zpr) {
    immutable bool dir_present = zpr.pod_dir.length > 0 && exists(zpr.pod_dir);
    if (dir_present) {
      try {
        rmdirRecurse(zpr.pod_dir);
      } catch (FileException) {
        stderr.writeln("WARNING: failed to clean up temp zip extraction: ", zpr.pod_dir);
      }
    }
    zpr.ok = false;
  }
}
#+END_SRC

* org includes
** project version

#+NAME: spine_version
#+HEADER: :noweb yes
#+BEGIN_SRC emacs-lisp
<<./sisudoc_spine_version_info_and_doc_header_including_copyright_and_license.org:spine_project_version()>>
#+END_SRC

** year

#+NAME: year
#+HEADER: :noweb yes
#+BEGIN_SRC emacs-lisp
<<./sisudoc_spine_version_info_and_doc_header_including_copyright_and_license.org:year()>>
#+END_SRC

** document header including copyright & license

#+NAME: doc_header_including_copyright_and_license
#+HEADER: :noweb yes
#+BEGIN_SRC emacs-lisp
<<./sisudoc_spine_version_info_and_doc_header_including_copyright_and_license.org:spine_doc_header_including_copyright_and_license()>>
#+END_SRC

* __END__