--- /dev/null
+#!/usr/local/bin/fontforge
+# Usage: <this script> <fontfile> <outdir>
+#
+# For each font name returned by FontsInFile($1), opens that font from $1 and
+# generates it into directory $2 once per extension listed in `formats`.
+
+# Names of the fonts found in the input file.
+a=FontsInFile($1)
+# Output extensions; Generate() is called once per entry for every font.
+formats=Array(4);
+formats[0]='ttf';
+formats[1]='woff';
+formats[2]='svg';
+formats[3]='svgz';
+i=0
+while(i<SizeOf(a))
+
+ # When the name splits into two parts on "+", keep only the second part.
+ # Presumably this drops a PDF subset prefix such as "ABCDEF+" -- TODO confirm.
+ e=StrSplit(a[i],"+",2)
+ name=a[i]
+ if(SizeOf(e)==2)
+ name=e[1]
+ endif
+
+
+ # NOTE(review): Open() receives the stripped name, not a[i]; confirm that
+ # FontForge still selects the intended font inside the container.
+ Print("open font "+$1+"("+name+")")
+ Open($1+"("+name+")")
+ # Diagnostic output only.
+ Print($fontname+">"+$nextfont)
+
+ j=0;
+
+ # Write the currently opened font as <outdir>/<name>.<ext> for every format.
+ while(j<SizeOf(formats))
+ out=$2+"/"+name+"."+formats[j]
+ Print("write file "+out)
+ Generate(out)
+ j++
+ endloop
+ i++
+endloop
\ No newline at end of file
--- /dev/null
+#!/usr/bin/python
+# vim:ts=8:sw=4:expandtab:encoding=utf-8
+# Export named font from PDF file using fontforge and ctypes
+
+import sys
+from ctypes import *
+
+STRING = c_char_p
+real = c_float
+
+# We need the `map` attribute of SplineFont, so declare an incomplete struct.
+# see: http://sourceforge.net/projects/wqy/files/misc/
+# file: fontforge-bindctypes-0.1.tar.bz2
+#
+# Only the leading fields up to and including `map` are listed; nothing after
+# `map` is read by this script.
+# NOTE(review): the field order and bit-field widths presumably have to match
+# the C `struct splinefont` of the installed libfontforge build -- confirm
+# against that version's splinefont.h, otherwise `map` is read at a wrong offset.
+class splinefont(Structure):
+ pass
+SplineFont = splinefont
+splinefont._fields_ = [
+ ('fontname', STRING),
+ ('fullname', STRING),
+ ('familyname', STRING),
+ ('weight', STRING),
+ ('copyright', STRING),
+ ('filename', STRING),
+ ('defbasefilename', STRING),
+ ('version', STRING),
+ ('italicangle', real),
+ ('upos', real),
+ ('uwidth', real),
+ ('ascent', c_int),
+ ('descent', c_int),
+ ('uniqueid', c_int),
+ ('glyphcnt', c_int),
+ ('glyphmax', c_int),
+ ('glyphs', POINTER(c_void_p)),
+ ('changed', c_uint, 1),
+ ('changed_since_autosave', c_uint, 1),
+ ('changed_since_xuidchanged', c_uint, 1),
+ ('display_antialias', c_uint, 1),
+ ('display_bbsized', c_uint, 1),
+ ('dotlesswarn', c_uint, 1),
+ ('onlybitmaps', c_uint, 1),
+ ('serifcheck', c_uint, 1),
+ ('issans', c_uint, 1),
+ ('isserif', c_uint, 1),
+ ('hasvmetrics', c_uint, 1),
+ ('loading_cid_map', c_uint, 1),
+ ('dupnamewarn', c_uint, 1),
+ ('encodingchanged', c_uint, 1),
+ ('multilayer', c_uint, 1),
+ ('strokedfont', c_uint, 1),
+ ('new', c_uint, 1),
+ ('compacted', c_uint, 1),
+ ('backedup', c_uint, 2),
+ ('use_typo_metrics', c_uint, 1),
+ ('weight_width_slope_only', c_uint, 1),
+ ('save_to_dir', c_uint, 1),
+ ('head_optimized_for_cleartype', c_uint, 1),
+ ('ticked', c_uint, 1),
+ ('internal_temp', c_uint, 1),
+ ('complained_about_spiros', c_uint, 1),
+ ('use_xuid', c_uint, 1),
+ ('use_uniqueid', c_uint, 1),
+ ('fv', c_void_p),
+ ('metrics', c_void_p),
+ ('uni_interp', c_int),
+ ('for_new_glyphs', c_void_p),
+ ('map', c_void_p),
+ # ...
+]
+
+def main():
+    """Export one font embedded in a PDF file to "<fontname>.ttf".
+
+    Expects exactly two command-line arguments: the PDF path and the name of
+    the font to extract.  Exits with status 2 on wrong usage and with status 1
+    when the PDF cannot be opened; otherwise prints whether the export worked.
+    """
+    if len(sys.argv) != 3:
+        print "Usage: %s doc.pdf fontname" % sys.argv[0]
+        sys.exit(2)
+    pdfname = sys.argv[1]
+    fontname = sys.argv[2]
+    fontfile = fontname + '.ttf'
+
+    # ctypes functions
+    libc = CDLL("libc.so.6")
+    # fopen returns a FILE*; keep it as an opaque pointer-sized value.
+    libc.fopen.restype = c_void_p
+    libc.fopen.argtypes = [c_char_p, c_char_p]
+
+    lib_ff = CDLL('libfontforge.so')
+
+    # SplineFont *_SFReadPdfFont(FILE *pdf,char *filename,
+    #   char *select_this_font, enum openflags openflags)
+    lib_ff._SFReadPdfFont.argtypes = [c_void_p, c_char_p, c_char_p, c_int]
+    lib_ff._SFReadPdfFont.restype = POINTER(SplineFont)
+
+    # int GenerateScript(SplineFont *sf, char *filename, char *bitmaptype,
+    #   int fmflags, int res, char *subfontdefinition, struct sflist *sfs,
+    #   EncMap *map, NameList *rename_to,int layer)
+    # The attribute has to be spelled `argtypes`: any other name is silently
+    # ignored by ctypes, and plain Python integers (such as the `map` pointer
+    # value) would then be converted as C int.
+    lib_ff.GenerateScript.argtypes = [POINTER(SplineFont), c_char_p, c_char_p,
+            c_int, c_int, c_char_p, c_void_p, c_void_p, c_void_p, c_int]
+    lib_ff.GenerateScript.restype = c_int
+
+    # need to somehow initialize libfontforge or it will segfault somewhere.
+    lib_ff.doinitFontForgeMain()
+    fobj = libc.fopen(pdfname, "rb")
+    if not fobj:
+        print "%s not found" % pdfname
+        sys.exit(1)
+
+    font = lib_ff._SFReadPdfFont(fobj, pdfname, fontname, 0)
+    # `ret` stays 0 when the font could not be read, which selects the
+    # error message below.
+    ret = 0
+    if bool(font):
+        ret = lib_ff.GenerateScript(font, fontfile, None, -1, -1, None, None,
+                font.contents.map, None, 1)
+    if ret:
+        print 'Font export to "%s".' % fontfile
+    else:
+        print "** Error ** Failed to export font!!"
+
+if __name__ == '__main__':
+    main()
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r
# See the License for the specific language governing permissions and\r
# limitations under the License.\r
-\r
-# This table is maps PDF stream operators to concrete OperatorProcessor\r
-# subclasses that are used by the PDFStreamEngine class to interpret the\r
-# PDF document. The classes configured here allow the PDFTextStripper\r
-# subclass of PDFStreamEngine to extract text content of the document.\r
-\r
-BT = org.apache.pdfbox.util.operator.BeginText\r
-cm = org.apache.pdfbox.util.operator.Concatenate\r
+#\r
+# This table maps PDF stream operators to concrete subclasses of the abstract\r
+# OperatorProcessor class, following the strategy pattern used by the\r
+# org.apache.pdfbox.util.PDFStreamEngine class.\r
+# To change the behaviour of the system, replace a class name with a new class name.\r
+b#=org.apache.pdfbox.util.operator.pagedrawer.CloseFillNonZeroAndStrokePath\r
+B#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroAndStrokePath\r
+b*#=org.apache.pdfbox.util.operator.pagedrawer.CloseFillEvenOddAndStrokePath\r
+B*#=org.apache.pdfbox.util.operator.pagedrawer.FillEvenOddAndStrokePath\r
+#BDC org.apache.pdfbox.util.operator.NotImplemented ##Begin Marked Content -- section 10.5\r
+BI#=org.apache.pdfbox.util.operator.pagedrawer.BeginInlineImage\r
+#BMC org.apache.pdfbox.util.operator.NotImplemented ##Begin Marked Content -- section 10.5\r
+BT=org.apache.pdfbox.util.operator.BeginText\r
+#BX org.apache.pdfbox.util.operator.NotImplemented\r
+c#=org.apache.pdfbox.util.operator.pagedrawer.CurveTo\r
+cm=org.apache.pdfbox.util.operator.Concatenate\r
CS=org.apache.pdfbox.util.operator.SetStrokingColorSpace\r
cs=org.apache.pdfbox.util.operator.SetNonStrokingColorSpace\r
-Do = org.apache.pdfbox.util.operator.Invoke\r
-ET = org.apache.pdfbox.util.operator.EndText\r
-gs = org.apache.pdfbox.util.operator.SetGraphicsStateParameters\r
+d#=org.apache.pdfbox.util.operator.pagedrawer.SetLineDashPattern\r
+#d0 org.apache.pdfbox.util.operator.NotImplemented\r
+#d1 org.apache.pdfbox.util.operator.NotImplemented\r
+Do#=org.apache.pdfbox.util.operator.pagedrawer.Invoke\r
+#DP org.apache.pdfbox.util.operator.NotImplemented ##Marked Content Point-- section 10.5\r
+#El org.apache.pdfbox.util.operator.NotImplemented\r
+#EMC org.apache.pdfbox.util.operator.NotImplemented ##End Marked Content -- section 10.5\r
+ET=org.apache.pdfbox.util.operator.EndText\r
+#EX org.apache.pdfbox.util.operator.NotImplemented\r
+f#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroRule\r
+F#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroRule\r
+f*#=org.apache.pdfbox.util.operator.pagedrawer.FillEvenOddRule\r
G=org.apache.pdfbox.util.operator.SetStrokingGrayColor\r
g=org.apache.pdfbox.util.operator.SetNonStrokingGrayColor\r
-q = org.apache.pdfbox.util.operator.GSave\r
-Q = org.apache.pdfbox.util.operator.GRestore\r
+gs=org.apache.pdfbox.util.operator.SetGraphicsStateParameters\r
+h#=org.apache.pdfbox.util.operator.pagedrawer.ClosePath\r
+#i org.apache.pdfbox.util.operator.NotImplemented\r
+#ID org.apache.pdfbox.util.operator.NotImplemented\r
+j#=org.apache.pdfbox.util.operator.pagedrawer.SetLineJoinStyle\r
+J#=org.apache.pdfbox.util.operator.pagedrawer.SetLineCapStyle\r
K=org.apache.pdfbox.util.operator.SetStrokingCMYKColor\r
k=org.apache.pdfbox.util.operator.SetNonStrokingCMYKColor\r
+l#=org.apache.pdfbox.util.operator.pagedrawer.LineTo\r
+m#=org.apache.pdfbox.util.operator.pagedrawer.MoveTo\r
+M#=org.apache.pdfbox.util.operator.pagedrawer.SetLineMiterLimit\r
+#MP org.apache.pdfbox.util.operator.NotImplemented ##Marked Content Point-- section 10.5\r
+n#=org.apache.pdfbox.util.operator.pagedrawer.EndPath\r
+q=org.apache.pdfbox.util.operator.GSave\r
+Q=org.apache.pdfbox.util.operator.GRestore\r
+re#=org.apache.pdfbox.util.operator.pagedrawer.AppendRectangleToPath\r
RG=org.apache.pdfbox.util.operator.SetStrokingRGBColor\r
rg=org.apache.pdfbox.util.operator.SetNonStrokingRGBColor\r
+#ri org.apache.pdfbox.util.operator.NotImplemented\r
+s=org.apache.pdfbox.util.operator.CloseAndStrokePath\r
+S#=org.apache.pdfbox.util.operator.pagedrawer.StrokePath\r
SC=org.apache.pdfbox.util.operator.SetStrokingColor\r
sc=org.apache.pdfbox.util.operator.SetNonStrokingColor\r
SCN=org.apache.pdfbox.util.operator.SetStrokingColor\r
scn=org.apache.pdfbox.util.operator.SetNonStrokingColor\r
-T* = org.apache.pdfbox.util.operator.NextLine\r
-Tc = org.apache.pdfbox.util.operator.SetCharSpacing\r
-Td = org.apache.pdfbox.util.operator.MoveText\r
-TD = org.apache.pdfbox.util.operator.MoveTextSetLeading\r
-Tf = org.apache.pdfbox.util.operator.SetTextFont\r
-Tj = org.apache.pdfbox.util.operator.ShowText\r
-TJ = org.apache.pdfbox.util.operator.ShowTextGlyph\r
-TL = org.apache.pdfbox.util.operator.SetTextLeading\r
-Tm = org.apache.pdfbox.util.operator.SetMatrix\r
-Tr = org.apache.pdfbox.util.operator.SetTextRenderingMode\r
-Ts = org.apache.pdfbox.util.operator.SetTextRise\r
-Tw = org.apache.pdfbox.util.operator.SetWordSpacing\r
-Tz = org.apache.pdfbox.util.operator.SetHorizontalTextScaling\r
-w = org.apache.pdfbox.util.operator.SetLineWidth\r
-\' = org.apache.pdfbox.util.operator.MoveAndShow\r
-\" = org.apache.pdfbox.util.operator.SetMoveAndShow\r
-\r
-# The following operators are not relevant to text extraction,\r
-# so we can silently ignore them.\r
-\r
-b\r
-B\r
-b*\r
-B*\r
-BDC\r
-BI\r
-BMC\r
-BX\r
-c\r
-d\r
-d0\r
-d1\r
-DP\r
-El\r
-EMC\r
-EX\r
-f\r
-F\r
-f*\r
-h\r
-i\r
-ID\r
-j\r
-J\r
-l\r
-m\r
-M\r
-MP\r
-n\r
-re\r
-ri\r
-s\r
-S\r
-sh\r
-v\r
-W\r
-W*\r
-y\r
+sh#=org.apache.pdfbox.util.operator.pagedrawer.SHFill\r
+T*=org.apache.pdfbox.util.operator.NextLine\r
+Tc=org.apache.pdfbox.util.operator.SetCharSpacing\r
+Td=org.apache.pdfbox.util.operator.MoveText\r
+TD=org.apache.pdfbox.util.operator.MoveTextSetLeading\r
+Tf=org.apache.pdfbox.util.operator.SetTextFont\r
+Tj=org.apache.pdfbox.util.operator.ShowText\r
+TJ=org.apache.pdfbox.util.operator.ShowTextGlyph\r
+TL=org.apache.pdfbox.util.operator.SetTextLeading\r
+Tm=org.apache.pdfbox.util.operator.SetMatrix\r
+Tr=org.apache.pdfbox.util.operator.SetTextRenderingMode\r
+Ts=org.apache.pdfbox.util.operator.SetTextRise\r
+Tw=org.apache.pdfbox.util.operator.SetWordSpacing\r
+Tz=org.apache.pdfbox.util.operator.SetHorizontalTextScaling\r
+v#=org.apache.pdfbox.util.operator.pagedrawer.CurveToReplicateInitialPoint\r
+w#=org.apache.pdfbox.util.operator.pagedrawer.SetLineWidth\r
+W# org.apache.pdfbox.util.operator.pagedrawer.ClipNonZeroRule\r
+W*# org.apache.pdfbox.util.operator.pagedrawer.ClipEvenOddRule\r
+y#=org.apache.pdfbox.util.operator.pagedrawer.CurveToReplicateFinalPoint\r
+\'=org.apache.pdfbox.util.operator.MoveAndShow\r
+\"=org.apache.pdfbox.util.operator.SetMoveAndShow\r
$gs->setArg('-dNOPROMPT');\r
$gs->setArg('-dNODISPLAY');\r
$gs->setArg(null, WS_TOOLS . '/extractFonts.ps');\r
- $gs->setManualArg('-c "(' . $this->cropped . ') extractFonts quit"');\r
+ $gs->setManualArg('-c "(' . $this->in . ') extractFonts quit"');\r
\r
$gs->execute();\r
$this->addToLog($gs);\r
\r
$imagesize = getimagesize(WS_DOCS . '/' . $this->pages[1]['document_id'] . '/html/h150-' . $this->pages[1]['document_page'] . '.jpg');\r
$this->pdf2htmlRatio = $imagesize[0] / $this->layouts[1]['width'];\r
- $this->scale = 20;\r
+ $this->scale = 5;\r
$this->multiply = $this->pdf2htmlRatio * $this->scale;\r
\r
$this->createHTML();\r
foreach ($this->layouts as $page => $layout) {\r
$this->div[$page] = array();\r
$document_id = $this->pages[$page]['document_id'];\r
- foreach ($layout->xpath('//a') as $div) {\r
- $this->div[$page][] = $this->addCSSText($div, $document_id);\r
+ foreach ($layout->l as $line) {\r
+ $this->div[$page][] = $this->addLine($line, $document_id);\r
}\r
}\r
+\r
mkdir($this->vdir . '/style', 0777, true);\r
mkdir($this->vdir . '/contents', 0777, true);\r
mkdir($this->vdir . '/images', 0777, true);\r
\r
+ /**\r
+ * Concatenates the markup of every line of one page.\r
+ *\r
+ * @param array $page Entries that are each passed to writeLine().\r
+ * @return string The joined markup; an empty string for an empty page.\r
+ */\r
 protected function writePage($page) {\r
 $res = '';\r
- foreach ($page as $div) {\r
- $res .= $this->writeSpan($div);\r
+ foreach ($page as $line) {\r
+ $res .= $this->writeLine($line);\r
 }\r
 return $res;\r
 }\r
\r
- protected function writeSpan($span) {\r
- if ($span === false) {\r
- return '';\r
+ /**\r
+ * Renders one line as a <div> that wraps the markup of its groups.\r
+ *\r
+ * The div always carries class "l", plus "r<index>" when the line has a\r
+ * non-null rotation entry. Its top/left offsets are the line's y/x values\r
+ * scaled by $this->multiply and rounded to two decimals.\r
+ *\r
+ * @param array $line Array with keys x, y, rotation and groups.\r
+ * @return string HTML markup for the line.\r
+ */\r
+ protected function writeLine($line) {\r
+ $class = array('l');\r
+ if (!is_null($line['rotation'])) {\r
+ $class[] = 'r' . $line['rotation'];\r
 }\r
+ $class = implode(' ', $class);\r
 \r
- $top = round($span['y'] * $this->multiply, 2);\r
- $left = round($span['x'] * $this->multiply, 2);\r
+ $left = round($line['x'] * $this->multiply, 2);\r
+ $top = round($line['y'] * $this->multiply, 2);\r
 \r
- $class = '';\r
- if (!is_null($span['color'])) {\r
- $class.=' c' . $span['color'];\r
+ $res = '<div style="top:' . $top . 'px;left:' . $left . 'px;" class="' . $class . '">';\r
+ foreach ($line['groups'] as $group) {\r
+ $res.=$this->writeGroup($group);\r
 }\r
- if (!is_null($span['size'])) {\r
- $class.=' s' . $span['size'];\r
+ $res.='</div>';\r
+ return $res;\r
+ }\r
+\r
+ /**\r
+ * Renders one group as a <div> that wraps the markup of its spans.\r
+ *\r
+ * The div always carries class "g"; one extra class is appended for each\r
+ * non-null color (c), size (s), font (f), wordSpacing (w) and\r
+ * letterSpacing (l) index.\r
+ *\r
+ * @param array|false $group Group array, or false for a skipped group.\r
+ * @return string HTML markup; an empty string when $group is false.\r
+ */\r
+ protected function writeGroup($group) {\r
+ if ($group === false) {\r
+ return '';\r
 }\r
- if (!is_null($span['font'])) {\r
- $class.=' f' . $span['font'];\r
+\r
+ $class = array('g');\r
+ if (!is_null($group['color'])) {\r
+ $class[] = 'c' . $group['color'];\r
 }\r
- if (!is_null($span['wordSpacing'])) {\r
- $class.=' w' . $span['wordSpacing'];\r
+ if (!is_null($group['size'])) {\r
+ $class[] = 's' . $group['size'];\r
 }\r
- if (!is_null($span['letterSpacing'])) {\r
- $class.=' l' . $span['letterSpacing'];\r
+ if (!is_null($group['font'])) {\r
+ $class[] = 'f' . $group['font'];\r
 }\r
- if (!is_null($span['rotation'])) {\r
- $class.=' r' . $span['rotation'];\r
+ if (!is_null($group['wordSpacing'])) {\r
+ $class[] = 'w' . $group['wordSpacing'];\r
 }\r
+ if (!is_null($group['letterSpacing'])) {\r
+ $class[] = 'l' . $group['letterSpacing'];\r
+ }\r
+ $class = implode(' ', $class);\r
+\r
+ // The group's 'x' entry is emitted as the CSS "top" offset.\r
+ $top = round($group['x'] * $this->multiply, 2);\r
 \r
- $class = trim($class);\r
+ $res = '<div class="' . $class . '" style="top:' . $top . 'px;">';\r
+ foreach ($group['spans'] as $span) {\r
+ $res.=$this->writeSpan($span);\r
+ }\r
+ $res.='</div>';\r
+ return $res;\r
+ }\r
+\r
+ /**\r
+ * Renders one span as a <span> element with an inline left offset.\r
+ *\r
+ * The offset is the span's x value scaled by $this->multiply and rounded\r
+ * to two decimals.\r
+ *\r
+ * @param array|false $span Array with keys text and x, or false for an empty span.\r
+ * @return string HTML markup; an empty string when $span is false.\r
+ */\r
+ protected function writeSpan($span) {\r
+ if ($span === false) {\r
+ return '';\r
+ }\r
+ $left = round($span['x'] * $this->multiply, 2);\r
 \r
- $res = '<div ';\r
- $res .= 'style="left:' . $left . 'px;top:' . $top . 'px" ';\r
- $res .= 'class="' . $class . '"><span class="r">';\r
- $res .= htmlentities($span['text'], ENT_NOQUOTES, 'UTF-8');\r
- $res .= '</span></div>';\r
+ $res = '<span style="left:' . $left . 'px;">';\r
+ // Escape the text before embedding it in markup, as the replaced code did,\r
+ // so characters such as "<" or "&" cannot break the generated HTML.\r
+ $res .= str_replace(' ', ' ', htmlentities($span['text'], ENT_NOQUOTES, 'UTF-8'));\r
+ $res .= '</span>';\r
 return $res;\r
 }\r
\r
\r
foreach ($this->cssSize as $size => $index) {\r
$size*=$this->multiply;\r
- // Point to pixel conversion\r
$res[] = '.s' . $index . '{font-size:' . $size . 'px}';\r
}\r
\r
\r
foreach ($this->cssRotation as $rotation => $index) {\r
$rotation*= - 1;\r
- $res[] = '.r' . $index . '{-webkit-transform: rotate(' . $rotation . 'deg);-moz-transform: rotate(' . $rotation . 'deg)}';\r
+\r
+ $navigators = array('-moz-', '-webkit-', '-ms-', '-o-', '');\r
+ $t = 'transform:rotate(' . $rotation . 'deg);';\r
+ $to = 'transform-origin:left top;';\r
+\r
+ $css = '.r' . $index . '{';\r
+ foreach ($navigators as $n) {\r
+ $css.=$n . $t . $n . $to;\r
+ }\r
+ $css.='}';\r
+ $res[] = $css;\r
}\r
\r
foreach ($this->cssFont as $font => $index) {\r
return implode("\n", $res);\r
}\r
\r
- protected function addCSSText($l, $document_id) {\r
- $alpha = intval(substr($l['color'], 1, 2), 16);\r
- $text = (string) $l;\r
- if ($text == '') {\r
- return false;\r
+ /**\r
+ * Converts one line node into the array consumed by writeLine().\r
+ *\r
+ * @param mixed $line Line node; each of its `a` members is passed to\r
+ * addGroup(), and its x, y and rotation entries are read.\r
+ * @param mixed $document_id Forwarded unchanged to addGroup().\r
+ * @return array Keys: x, y, rotation (result of getCSSRotation()) and groups.\r
+ */\r
+ protected function addLine($line, $document_id) {\r
+ $res = array();\r
+ foreach ($line->a as $group) {\r
+ $res[] = $this->addGroup($group, $document_id);\r
 }\r
+ return array('x' => $this->normalizeFloatValue($line['x']),\r
+ 'y' => $this->normalizeFloatValue($line['y']),\r
+ 'rotation' => $this->getCSSRotation($this->normalizeFloatValue($line['rotation'], 0)),\r
+ "groups" => $res);\r
+ }\r
+\r
+ /**\r
+ * Converts one group node into the array consumed by writeGroup().\r
+ *\r
+ * @param mixed $group Group node; each of its `s` members is passed to\r
+ * addSpan(), and its color, size, font, letterspacing and wordspacing\r
+ * entries are read.\r
+ * @param mixed $document_id Forwarded to getCSSFont() and addSpan().\r
+ * @return array|false Group array, or false when the two hex digits at\r
+ * offset 1 of the color entry decode to 0.\r
+ */\r
+ protected function addGroup($group, $document_id) {\r
+ // Presumably these two hex digits are the alpha channel -- TODO confirm the color format.\r
+ $alpha = intval(substr($group['color'], 1, 2), 16);\r
 if ($alpha == 0) {\r
 return false;\r
 }\r
+\r
+ $res = array();\r
+ foreach ($group->s as $span) {\r
+ $res[] = $this->addSpan($span, $document_id);\r
+ }\r
+ // NOTE(review): getCSSSize() takes its argument by reference and rescales it,\r
+ // so the 'x' entry below may be computed from the already rescaled size -- confirm.\r
+ return array(\r
+ 'color' => $this->getCSSColor($group['color']),\r
+ 'size' => $this->getCSSSize($group['size']),\r
+ 'font' => $this->getCSSFont($group['font'], $document_id),\r
+ 'letterSpacing' => $this->getCSSLetterSpacing($group['letterspacing']),\r
+ 'wordSpacing' => $this->getCSSWordSpacing($group['wordspacing']),\r
+ 'x' => $group['size']/-1.2,\r
+ 'spans' => $res);\r
+ }\r
+\r
+ /**\r
+ * Converts one span node into the array consumed by writeSpan().\r
+ *\r
+ * @param mixed $span Span node; its string value is the text and its x entry is read.\r
+ * @param mixed $document_id Not used by this method.\r
+ * @return array|false Array with keys text and x, or false when the text is empty.\r
+ */\r
+ protected function addSpan($span, $document_id) {\r
+\r
+ $text = (string) $span;\r
+ if ($text == '') {\r
+ return false;\r
+ }\r
 return array('text' => $text,\r
- 'color' => $this->getCSSColor($l['color']),\r
- 'size' => $this->getCSSSize($l['size']),\r
- 'font' => $this->getCSSFont($l['font'], $document_id),\r
- 'letterSpacing' => $this->getCSSLetterSpacing($l['letterspacing']),\r
- 'wordSpacing' => $this->getCSSWordSpacing($l['wordspacing']),\r
- 'rotation' => $this->getCSSRotation($l['rotation']),\r
- 'x' => $this->normalizeFloatValue($l['x']),\r
- 'y' => $this->normalizeFloatValue($l['y']));\r
+ 'x' => $this->normalizeFloatValue($span['x']));\r
 }\r
\r
+ /**\r
+ * Returns the index registered in $this->cssSize for a font size.\r
+ *\r
+ * The size is divided by 1.05 and normalized before the lookup. Because the\r
+ * parameter is taken by reference, the caller's variable is overwritten\r
+ * with the rescaled, normalized value.\r
+ *\r
+ * @param mixed $size Font size; modified in place.\r
+ * @return mixed Whatever getIndex() returns for the normalized size.\r
+ */\r
- protected function getCSSSize($size) {\r
+ protected function getCSSSize(&$size) {\r
+ $size/=1.05;\r
 $size = $this->normalizeFloatValue($size);\r
 return $this->getIndex($size, $this->cssSize);\r
 }\r
}\r
\r
protected function getCSSLetterSpacing($letterspacing) {\r
- $letterspacing = $this->normalizeFloatValue($letterspacing);\r
+ $letterspacing = $this->normalizeFloatValue($letterspacing, 2);\r
if ($letterspacing == 0) {\r
return null;\r
}\r
}\r
\r
protected function getCSSWordSpacing($wordspacing) {\r
- $wordspacing = $this->normalizeFloatValue($wordspacing);\r
+ $wordspacing = $this->normalizeFloatValue($wordspacing, 2);\r
if ($wordspacing == 0) {\r
return null;\r
}\r
}\r
\r
protected function getCSSRotation($rotation) {\r
- $rotation = round($rotation);\r
+ $rotation = $this->normalizeFloatValue($rotation, 0);\r
if ($rotation == 0) {\r
return null;\r
}\r
return $res;\r
}\r
\r
+ /**\r
+ * Converts a value to a float rounded to a fixed number of decimals.\r
+ *\r
+ * Commas are replaced by dots before the cast, so "1,5" is read as 1.5.\r
+ *\r
+ * @param mixed $value Value to convert.\r
+ * @param int $round Number of decimals passed to round(); defaults to 3.\r
+ * @return float The rounded value.\r
+ */\r
- protected function normalizeFloatValue($value) {\r
+ protected function normalizeFloatValue($value, $round=3) {\r
 $value = str_replace(',', '.', $value);\r
 $value = (float) $value;\r
- $value = round($value, 3);\r
+ $value = round($value, $round);\r
 return $value;\r
 }\r
\r