From a1fd226ef8ae434f81f010ee8681fc059dbbe6f2 Mon Sep 17 00:00:00 2001
From: Ralph Amissah <ralph@amissah.com>
Date: Thu, 26 Jul 2007 17:51:16 +0100
Subject: multiple url matching refinements, open archive initiative

---
 CHANGELOG                        | 13 ++++++-------
 lib/sisu/v0/html_tune.rb         |  4 ++--
 lib/sisu/v0/odf.rb               | 12 ++++++------
 lib/sisu/v0/shared_html_lite.rb  |  4 ++--
 lib/sisu/v0/shared_xml.rb        | 10 +++++-----
 lib/sisu/v0/texpdf_format.rb     |  4 ++--
 lib/sisu/v0/xml_md_oai_pmh_dc.rb |  5 +++++
 7 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 93d3ed72..2db96703 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -12,20 +12,18 @@ http://www.jus.uio.no/sisu/pkg/src/sisu_0.55.6.orig.tar.gz
   sisu_0.55.6-1.dsc
   sisu_0.55.6-1.diff.gz
 
-  * db html, fix related to match of multiple urls within paragraph
+  * matching of multiple urls within a paragraph
+    * db html (html_lite), bug fix
+    * multiple uls listed, refinement: html, html_lite, xml, odf, texpdf
 
   * open archive initiative for metadata harvesting, initial implementation,
     Dublin Core, XML output available (-O), decide use later (filenames, output
-    dir etc.), look at later and refine accordingly:
-      http://www.openarchives.org/pmh/
-      http://www.openarchives.org/OAI/2.0/openarchivesprotocol.htm#dublincore
-      http://es.dublincore.org/documents/usageguide/elements.shtml
-      http://dublincore.org/documents/dces/
-      see also http://dublincore.org/documents/dcmes-xml/
+    dir etc.)
 
   * debian vim
     * moved vim install back to addons
     * added recommends vim-addon-manager
+    (thanks zack)
 
 %% sisu_0.55.5.orig.tar.gz (2007-07-22:29/7)
 http://www.jus.uio.no/sisu/pkg/src/sisu_0.55.5.orig.tar.gz
@@ -47,6 +45,7 @@ http://www.jus.uio.no/sisu/pkg/src/sisu_0.55.5.orig.tar.gz
     open standards
 
   * debian vim, syntax and ftplugin install moved to /usr/share/vim-scripts
+    (syntax file synced with Bram, thanks)
 
 %% sisu_0.55.4.orig.tar.gz (2007-07-20:29/5)
 http://www.jus.uio.no/sisu/pkg/src/sisu_0.55.4.orig.tar.gz
diff --git a/lib/sisu/v0/html_tune.rb b/lib/sisu/v0/html_tune.rb
index 1d3461c3..66c45aed 100644
--- a/lib/sisu/v0/html_tune.rb
+++ b/lib/sisu/v0/html_tune.rb
@@ -325,9 +325,9 @@ module SiSU_Tune
           if (para =~/\b\S+\@\S+?\.\S+/ and para !~/(\"\S+\@\S+?\.\S+\"|>\S+\@\S+?\.\S+?<)/)
             para.gsub!(/\b(\S+\@\S+?\.\S+)(\s)/,'&lt;<a href="mailto:\1">\1</a>&gt;\2')
           end
-          para.gsub!(/(^|\s)[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1<a href="\2" target="_top">\2</a>\3') #http ftp matches escaped, no decoration
+          para.gsub!(/\b[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'<a href="\1" target="_top">\1</a>\2') #http ftp matches escaped, no decoration
           para.gsub!(/((?:^|\s)[}])((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1<a href="\2" target="_top">\2</a>\3') #special case \{ e.g. \}http://url
-          para.gsub!(/(^|\s)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,%{\\1#{@url_brace.xml_open}<a href="\\2" target="_top">\\2</a>#{@url_brace.xml_close}\\3}) #http ftp matches with decoration
+          para.gsub!(/\b((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,%{#{@url_brace.xml_open}<a href="\\1" target="_top">\\1</a>#{@url_brace.xml_close}\\2}) #http ftp matches with decoration
           if (para =~/..\/\S+/ and para !~/(\"..\/\S+?\"|>\s*..\/\S+<)/)
             para.gsub!(/(\.\.\/\S+)/,'<a href="\1">\1</a>')
           end
diff --git a/lib/sisu/v0/odf.rb b/lib/sisu/v0/odf.rb
index cff57888..fbd4cc62 100644
--- a/lib/sisu/v0/odf.rb
+++ b/lib/sisu/v0/odf.rb
@@ -295,12 +295,12 @@ module SiSU_ODF
       end
       def normal(para)                                                           #P1 - P3
         para.gsub!(@serial,'')
-        para.gsub!(/(^|\s)[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,
-          %{\\1<text:a xlink:type="simple" xlink:href="\\2">\\2</text:a>\\3}) #http ftp matches escaped, no decoration
+        para.gsub!(/\b[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,
+          %{<text:a xlink:type="simple" xlink:href="\\1">\\1</text:a>\\2}) #http ftp matches escaped, no decoration
         para.gsub!(/((?:^|\s)[}])((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,
           %{\\1<text:a xlink:type="simple" xlink:href="\\2">\\2</text:a>\\3}) #special case \{ e.g. \}http://url
-        para.gsub!(/(^|\s)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,
-          %{\\1#{@url_brace.xml_open}<text:a xlink:type="simple" xlink:href="\\2">\\2</text:a>#{@url_brace.xml_close}\\3}) #http ftp matches with decoration
+        para.gsub!(/\b((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,
+          %{#{@url_brace.xml_open}<text:a xlink:type="simple" xlink:href="\\1">\\1</text:a>#{@url_brace.xml_close}\\2}) #http ftp matches with decoration
         para.gsub!(/([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+)/,
           %{#{@url_brace.xml_open}<text:a xlink:type="simple" xlink:href="mailto:\\1">\\1</text:a>#{@url_brace.xml_close}})
         para=case para
@@ -375,8 +375,8 @@ module SiSU_ODF
         parray=[]
         para.split(/<:?br(?: \/)?>/).each do |parablock|
           parablock=group_clean(parablock)
-          parablock.gsub!(/(^|\s)[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,
-            %{\\1<text:a xlink:type="simple" xlink:href="\\2">\\2</text:a>\\3}) #http ftp matches escaped, no decoration
+          parablock.gsub!(/\b[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,
+            %{<text:a xlink:type="simple" xlink:href="\\1">\\1</text:a>\\2}) #http ftp matches escaped, no decoration
           parray << %{<text:p text:style-name="P5">#{parablock}</text:p>} if parablock =~/\S+/
         end
         para=parray.join + '<text:p text:style-name="Standard"/>'
diff --git a/lib/sisu/v0/shared_html_lite.rb b/lib/sisu/v0/shared_html_lite.rb
index 2bcea532..16491ebf 100644
--- a/lib/sisu/v0/shared_html_lite.rb
+++ b/lib/sisu/v0/shared_html_lite.rb
@@ -131,9 +131,9 @@ module SiSU_Format_Shared
         words=word_mode.join(' ')
         para.gsub!(/.+/,words)
       end
-      para.gsub!(/(^|\s)[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1<a href="\2" target="_top">\2</a>\3') #http ftp matches escaped, no decoration
+      para.gsub!(/\b[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'<a href="\1" target="_top">\1</a>\2') #http ftp matches escaped, no decoration
       para.gsub!(/((?:^|\s)[}])((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1<a href="\2" target="_top">\2</a>\3') #special case \{ e.g. \}http://url
-      para.gsub!(/(^|\s)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,%{\\1#{@url_brace.xml_open}<a href="\\2" target="_top">\\2</a>#{@url_brace.xml_close}\\3}) #http ftp matches with decoration
+      para.gsub!(/\b((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,%{#{@url_brace.xml_open}<a href="\\1" target="_top">\\1</a>#{@url_brace.xml_close}\\2}) #http ftp matches with decoration
       para
     end
     def paragraph
diff --git a/lib/sisu/v0/shared_xml.rb b/lib/sisu/v0/shared_xml.rb
index 995044db..249085a1 100644
--- a/lib/sisu/v0/shared_xml.rb
+++ b/lib/sisu/v0/shared_xml.rb
@@ -356,11 +356,11 @@ module SiSU_XML_munge
       #para.gsub!(/^_\*\s+/,'<image xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:actuate="onLoad" xlink:show="embed" xlink:href="/usr/share/sisu/image/bullet_red.png" width="12" height="12" alt="*" /> ')
       para.gsub!(/(^|\s)\{\s*(\S+?\.(?:jpg|png|gif))\s+(\d+)x(\d+)(\s+[^}]+)?\}(https?:\/\/\S+)/,%{\\1<image xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:actuate="onLoad" xlink:show="embed" xlink:href="#{@dir.url.images_local}/\\2" width="\\3" height="\\4" />[\\2] \\5})
       para.gsub!(/(^|\s)\{\s*(\S+?\.(?:jpg|png|gif))(\s+[^}]+)?\}(https?:\/\/\S+)/,%{\\1<image xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:actuate="onLoad" xlink:show="embed" xlink:href="#{@dir.url.images_local}/\\2"/>\\2})
-      para.gsub!(/(^|\s)\{([^}]+)\}(https?:\/\/[^"><]+?)([,.:;"><]?(?:\s|$))/,
-        '\1<link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="\3">\2</link>\4') #watch, compare html_tune
-      para.gsub!(/(^|\s)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,
-        %{\\1#{@url_brace.xml_open}<link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="\\2">\\2</link>#{@url_brace.xml_close}\\3})
-      para.gsub!(/(^|\s)[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\2\3') #escaped urls not linked, deal with later
+      para.gsub!(/\B\{([^}]+)\}(https?:\/\/[^"><]+?)([,.:;"><]?(?:\s|$))/,
+        '<link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="\2">\1</link>\3') #watch, compare html_tune
+      para.gsub!(/\b((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,
+        %{#{@url_brace.xml_open}<link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="\\1">\\1</link>#{@url_brace.xml_close}\\2})
+      para.gsub!(/\b[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'<link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="\1">\1</link>\2') #escaped urls not linked, deal with later
       #para.gsub!(/(^|\s)[_\\]((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1<link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="\2">\2</link>\3') #escaped urls not linked, deal with later
       para.gsub!(/&nbsp;/,' ') #clean
       para
diff --git a/lib/sisu/v0/texpdf_format.rb b/lib/sisu/v0/texpdf_format.rb
index 4a8d2cb5..81646f23 100644
--- a/lib/sisu/v0/texpdf_format.rb
+++ b/lib/sisu/v0/texpdf_format.rb
@@ -501,8 +501,8 @@ WOK
       @string.gsub!(/<\/a>/,' ')
       @string.gsub!(/[^\}>_]((?:https?|ftp):\/\/\S+?)(<\/\S>)/,' \begin{scriptsize}\href{\1}{\1} \end{scriptsize}\2') #special case
       @string.gsub!(/((?:^|\s)[}])((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\begin{scriptsize}\\href{\2}{\2}\end{scriptsize}\3') #special case \{ e.g. \}http://url
-      @string.gsub!(/(^|\s)(?:\\_|\\)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\1\begin{scriptsize}\\href{\2}{\2}\end{scriptsize}\3') #specially escaped url no decoration
-      @string.gsub!(/(^|\s)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,"\\1#{@url_brace.tex_open}\\begin{scriptsize}\\href{\\2}{\\2}\\end{scriptsize}#{@url_brace.tex_close}\\3") #url matching with decoration <url>
+      @string.gsub!(/\B(?:\\_|\\)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?:\s|$))/,'\begin{scriptsize}\\href{\1}{\1}\end{scriptsize}\2') #specially escaped url no decoration
+      @string.gsub!(/(^|\s)((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([.,]?(?=\s|$))/,"\\1#{@url_brace.tex_open}\\begin{scriptsize}\\href{\\2}{\\2}\\end{scriptsize}#{@url_brace.tex_close}\\3") #url matching with decoration <url> positive lookahead, sequence issue with { linked }http://url cannot use \b at start
       @string.gsub!(/<:ee>/,'')
       @string.gsub!(/<!>/,' ')
       #proposed change, insert, but may be redundant
diff --git a/lib/sisu/v0/xml_md_oai_pmh_dc.rb b/lib/sisu/v0/xml_md_oai_pmh_dc.rb
index 7ac7c3a6..1d7008a1 100644
--- a/lib/sisu/v0/xml_md_oai_pmh_dc.rb
+++ b/lib/sisu/v0/xml_md_oai_pmh_dc.rb
@@ -182,6 +182,11 @@ WOK
   end
 end
 __END__
+http://www.openarchives.org/pmh/
+http://www.openarchives.org/OAI/2.0/openarchivesprotocol.htm#dublincore
+http://es.dublincore.org/documents/usageguide/elements.shtml
+http://dublincore.org/documents/dces/
+see also http://dublincore.org/documents/dcmes-xml/
 #http://www.openarchives.org/OAI/2.0/openarchivesprotocol.htm#dublincore
 #sample implementation, e.g. 2
 <?xml version="1.0" encoding="UTF-8"?>
-- 
cgit v1.2.3