]> _ Git - cubist_pdf.git/commitdiff
wip #7967 @0.75
authorVincent Vanwaelscappel <vincent@cubedesigners.com>
Thu, 29 Jan 2026 10:55:48 +0000 (11:55 +0100)
committerVincent Vanwaelscappel <vincent@cubedesigners.com>
Thu, 29 Jan 2026 10:55:48 +0000 (11:55 +0100)
resources/tools/fwstk/.idea/workspace.xml
resources/tools/fwstk/bin/com/fluidbook/fwstk/Main.class
resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/LayoutStripper.class
resources/tools/fwstk/bin/org/apache/pdfbox/resources/LayoutStripper.properties
resources/tools/fwstk/out/artifacts/fwstk_jar/fwstk.jar
resources/tools/fwstk/out/artifacts/fwstk_jar/org/apache/pdfbox/resources/LayoutStripper.properties
resources/tools/fwstk/project_resources/org/apache/pdfbox/resources/LayoutStripper.properties
resources/tools/fwstk/src/com/fluidbook/fwstk/Main.java
resources/tools/fwstk/src/com/fluidbook/fwstk/layout/LayoutStripper.java

index 069fa80f70e003cf6604640582bdc3bf9b36b107..b9018713c8ca79f53aab6a5526aded61ce2d8af8 100644 (file)
@@ -82,7 +82,7 @@
   <component name="MarkdownSettingsMigration">
     <option name="stateVersion" value="1" />
   </component>
-  <component name="PhpWorkspaceProjectConfiguration" interpreter_name="PHP 8.2" />
+  <component name="PhpWorkspaceProjectConfiguration" interpreter_name="PHP 8.5" />
   <component name="ProjectColorInfo">{
   &quot;associatedIndex&quot;: 8
 }</component>
     <option name="hideEmptyMiddlePackages" value="true" />
     <option name="showLibraryContents" value="true" />
   </component>
-  <component name="PropertiesComponent">{
-  &quot;keyToString&quot;: {
-    &quot;Application.extract links.executor&quot;: &quot;Run&quot;,
-    &quot;RunOnceActivity.OpenProjectViewOnStart&quot;: &quot;true&quot;,
-    &quot;RunOnceActivity.ShowReadmeOnStart&quot;: &quot;true&quot;,
-    &quot;RunOnceActivity.git.unshallow&quot;: &quot;true&quot;,
-    &quot;WebServerToolWindowFactoryState&quot;: &quot;true&quot;,
-    &quot;WebServerToolWindowPanel.toolwindow.highlight.mappings&quot;: &quot;true&quot;,
-    &quot;WebServerToolWindowPanel.toolwindow.highlight.symlinks&quot;: &quot;true&quot;,
-    &quot;WebServerToolWindowPanel.toolwindow.show.date&quot;: &quot;false&quot;,
-    &quot;WebServerToolWindowPanel.toolwindow.show.permissions&quot;: &quot;false&quot;,
-    &quot;WebServerToolWindowPanel.toolwindow.show.size&quot;: &quot;false&quot;,
-    &quot;git-widget-placeholder&quot;: &quot;master&quot;,
-    &quot;ignore.virus.scanning.warn.message&quot;: &quot;true&quot;,
-    &quot;junie.onboarding.icon.badge.shown&quot;: &quot;true&quot;,
-    &quot;kotlin-language-version-configured&quot;: &quot;true&quot;,
-    &quot;last_opened_file_path&quot;: &quot;D:/Works/cubist_pdf/resources/tools/fwstk&quot;,
-    &quot;node.js.detected.package.eslint&quot;: &quot;true&quot;,
-    &quot;node.js.detected.package.tslint&quot;: &quot;true&quot;,
-    &quot;node.js.selected.package.eslint&quot;: &quot;(autodetect)&quot;,
-    &quot;node.js.selected.package.tslint&quot;: &quot;(autodetect)&quot;,
-    &quot;nodejs_package_manager_path&quot;: &quot;npm&quot;,
-    &quot;project.structure.last.edited&quot;: &quot;Libraries&quot;,
-    &quot;project.structure.proportion&quot;: &quot;0.15&quot;,
-    &quot;project.structure.side.proportion&quot;: &quot;0.2&quot;,
-    &quot;ruby.rails.projectView.checked&quot;: &quot;true&quot;,
-    &quot;settings.editor.selected.configurable&quot;: &quot;preferences.lookFeel&quot;,
-    &quot;vue.rearranger.settings.migration&quot;: &quot;true&quot;
+  <component name="PropertiesComponent"><![CDATA[{
+  "keyToString": {
+    "Application.extract links.executor": "Run",
+    "Application.extract texts.executor": "Run",
+    "RunOnceActivity.OpenProjectViewOnStart": "true",
+    "RunOnceActivity.ShowReadmeOnStart": "true",
+    "RunOnceActivity.git.unshallow": "true",
+    "RunOnceActivity.typescript.service.memoryLimit.init": "true",
+    "WebServerToolWindowFactoryState": "true",
+    "WebServerToolWindowPanel.toolwindow.highlight.mappings": "true",
+    "WebServerToolWindowPanel.toolwindow.highlight.symlinks": "true",
+    "WebServerToolWindowPanel.toolwindow.show.date": "false",
+    "WebServerToolWindowPanel.toolwindow.show.permissions": "false",
+    "WebServerToolWindowPanel.toolwindow.show.size": "false",
+    "git-widget-placeholder": "master",
+    "ignore.virus.scanning.warn.message": "true",
+    "junie.onboarding.icon.badge.shown": "true",
+    "kotlin-language-version-configured": "true",
+    "last_opened_file_path": "D:/Works/cubist_pdf/resources/tools/fwstk",
+    "node.js.detected.package.eslint": "true",
+    "node.js.detected.package.tslint": "true",
+    "node.js.selected.package.eslint": "(autodetect)",
+    "node.js.selected.package.tslint": "(autodetect)",
+    "nodejs_package_manager_path": "npm",
+    "project.structure.last.edited": "Libraries",
+    "project.structure.proportion": "0.15",
+    "project.structure.side.proportion": "0.2",
+    "ruby.rails.projectView.checked": "true",
+    "settings.editor.selected.configurable": "preferences.lookFeel",
+    "vue.rearranger.settings.migration": "true"
   }
-}</component>
+}]]></component>
   <component name="RecentsManager">
     <key name="CopyFile.RECENT_KEYS">
       <recent name="H:\Works\cubeExtranet\fluidbook\tools\fwstk\lib" />
       <recent name="H:\Works\cubeExtranet\fluidbook\tools\fwstk" />
     </key>
   </component>
-  <component name="RunManager" selected="Application.extract links">
+  <component name="RunManager" selected="Application.extract texts">
     <configuration default="true" type="AndroidRunConfigurationType" factoryName="Android App">
       <option name="DEPLOY" value="true" />
       <option name="DEPLOY_APK_FROM_BUNDLE" value="false" />
       <option name="USE_PATTERN" value="false" />
       <method />
     </configuration>
-    <configuration default="true" type="tests" factoryName="Nosetests">
-      <module name="fwstk" />
-      <option name="INTERPRETER_OPTIONS" value="" />
-      <option name="PARENT_ENVS" value="true" />
-      <option name="SDK_HOME" value="" />
-      <option name="WORKING_DIRECTORY" value="" />
-      <option name="IS_MODULE_SDK" value="false" />
-      <option name="ADD_CONTENT_ROOTS" value="true" />
-      <option name="ADD_SOURCE_ROOTS" value="true" />
-      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
-      <option name="_new_regexPattern" value="&quot;&quot;" />
-      <option name="_new_additionalArguments" value="&quot;&quot;" />
-      <option name="_new_target" value="&quot;.&quot;" />
-      <option name="_new_targetType" value="&quot;PATH&quot;" />
-      <method v="2" />
-    </configuration>
-    <configuration default="true" type="tests" factoryName="Unittests">
-      <module name="fwstk" />
-      <option name="INTERPRETER_OPTIONS" value="" />
-      <option name="PARENT_ENVS" value="true" />
-      <option name="SDK_HOME" value="" />
-      <option name="WORKING_DIRECTORY" value="" />
-      <option name="IS_MODULE_SDK" value="false" />
-      <option name="ADD_CONTENT_ROOTS" value="true" />
-      <option name="ADD_SOURCE_ROOTS" value="true" />
-      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
-      <option name="_new_additionalArguments" value="&quot;&quot;" />
-      <option name="_new_target" value="&quot;.&quot;" />
-      <option name="_new_targetType" value="&quot;PATH&quot;" />
-      <method v="2" />
-    </configuration>
-    <configuration default="true" type="tests" factoryName="py.test">
-      <module name="fwstk" />
-      <option name="INTERPRETER_OPTIONS" value="" />
-      <option name="PARENT_ENVS" value="true" />
-      <option name="SDK_HOME" value="" />
-      <option name="WORKING_DIRECTORY" value="" />
-      <option name="IS_MODULE_SDK" value="false" />
-      <option name="ADD_CONTENT_ROOTS" value="true" />
-      <option name="ADD_SOURCE_ROOTS" value="true" />
-      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
-      <option name="_new_keywords" value="&quot;&quot;" />
-      <option name="_new_parameters" value="&quot;&quot;" />
-      <option name="_new_additionalArguments" value="&quot;&quot;" />
-      <option name="_new_target" value="&quot;.&quot;" />
-      <option name="_new_targetType" value="&quot;PATH&quot;" />
-      <method v="2" />
-    </configuration>
     <configuration name="extract layout" type="Application" factoryName="Application">
       <option name="MAIN_CLASS_NAME" value="com.fluidbook.fwstk.Main" />
       <module name="fwstk" />
     <configuration name="extract texts" type="Application" factoryName="Application">
       <option name="MAIN_CLASS_NAME" value="com.fluidbook.fwstk.Main" />
       <module name="fwstk" />
-      <option name="PROGRAM_PARAMETERS" value="--input C:\Users\vince\Desktop\20929.pdf --mode robust --extractTextsMethod fluidbook --extractTexts C:\Users\vince\Desktop\20929\%s%d.txt --threads 1" />
+      <option name="PROGRAM_PARAMETERS" value="--input C:\Users\vince\Downloads\fluidbook_33884.pdf --extractTextsMethod fluidbook --extractTexts C:\Users\vince\Desktop\33884\%s%d.txt --threads 1" />
       <method v="2">
         <option name="Make" enabled="true" />
       </method>
       <envs />
       <method v="2" />
     </configuration>
+    <configuration default="true" type="tests" factoryName="Nosetests">
+      <module name="fwstk" />
+      <option name="ENV_FILES" value="" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="" />
+      <option name="IS_MODULE_SDK" value="false" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="RUN_TOOL" value="" />
+      <option name="_new_regexPattern" value="&quot;&quot;" />
+      <option name="_new_additionalArguments" value="&quot;&quot;" />
+      <option name="_new_target" value="&quot;.&quot;" />
+      <option name="_new_targetType" value="&quot;PATH&quot;" />
+      <method v="2" />
+    </configuration>
+    <configuration default="true" type="tests" factoryName="Unittests">
+      <module name="fwstk" />
+      <option name="ENV_FILES" value="" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="" />
+      <option name="IS_MODULE_SDK" value="false" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="RUN_TOOL" value="" />
+      <option name="_new_additionalArguments" value="&quot;&quot;" />
+      <option name="_new_target" value="&quot;.&quot;" />
+      <option name="_new_targetType" value="&quot;PATH&quot;" />
+      <method v="2" />
+    </configuration>
+    <configuration default="true" type="tests" factoryName="py.test">
+      <module name="fwstk" />
+      <option name="ENV_FILES" value="" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="" />
+      <option name="IS_MODULE_SDK" value="false" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="RUN_TOOL" value="" />
+      <option name="_new_keywords" value="&quot;&quot;" />
+      <option name="_new_parameters" value="&quot;&quot;" />
+      <option name="_new_additionalArguments" value="&quot;&quot;" />
+      <option name="_new_target" value="&quot;.&quot;" />
+      <option name="_new_targetType" value="&quot;PATH&quot;" />
+      <method v="2" />
+    </configuration>
     <list>
       <item itemvalue="Application.extract layout" />
       <item itemvalue="Application.extract links" />
   <component name="SharedIndexes">
     <attachedChunks>
       <set>
-        <option value="bundled-jdk-9823dce3aa75-fbdcb00ec9e3-intellij.indexing.shared.core-IU-251.25410.129" />
-        <option value="bundled-js-predefined-d6986cc7102b-6a121458b545-JavaScript-IU-251.25410.129" />
+        <option value="bundled-jdk-30f59d01ecdd-2fc7cc6b9a17-intellij.indexing.shared.core-IU-253.30387.90" />
+        <option value="bundled-js-predefined-d6986cc7102b-9b0f141eb926-JavaScript-IU-253.30387.90" />
       </set>
     </attachedChunks>
   </component>
       <workItem from="1748351552423" duration="1932000" />
       <workItem from="1748355409566" duration="1244000" />
       <workItem from="1748356736199" duration="3367000" />
+      <workItem from="1769008557586" duration="495000" />
+      <workItem from="1769682693343" duration="1080000" />
     </task>
     <task id="LOCAL-00001" summary="wip #1111 @0.5">
       <created>1487172253077</created>
index bdf1aa8a51478cc37a837e1ed8ca40568db85d2a..3a364a1b681e358a2a1465527725273f144ae9d7 100644 (file)
Binary files a/resources/tools/fwstk/bin/com/fluidbook/fwstk/Main.class and b/resources/tools/fwstk/bin/com/fluidbook/fwstk/Main.class differ
index 7cb76994710f475f1e3c5571a85e3d3bbbc4b174..2fdde95ba2d36b2cef7a7810c4ba5af0fca53967 100644 (file)
Binary files a/resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/LayoutStripper.class and b/resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/LayoutStripper.class differ
index 6938011f570cd63ffe097d2480b42f8d333590a1..13a585079c6f431e132c6fb552cbe3efd9e2a54d 100644 (file)
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
-# this Table is a correspondance Map of the PDF stream operators with concretes class of the
-# OperatorProcessor abstract class for the stategy pattern used in the 
-# org.apache.pdfbox.util.PDFStreamEngine class.
-# To change the behaviour of the system, remplace the class name by a new class name.
-b#=org.apache.pdfbox.util.operator.pagedrawer.CloseFillNonZeroAndStrokePath
-B#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroAndStrokePath
-b*#=org.apache.pdfbox.util.operator.pagedrawer.CloseFillEvenOddAndStrokePath
-B*#=org.apache.pdfbox.util.operator.pagedrawer.FillEvenOddAndStrokePath
-#BDC org.apache.pdfbox.util.operator.NotImplemented ##Begin Marked Content -- section 10.5
-BI#=org.apache.pdfbox.util.operator.pagedrawer.BeginInlineImage
-#BMC org.apache.pdfbox.util.operator.NotImplemented ##Begin Marked Content -- section 10.5
-BT=org.apache.pdfbox.util.operator.BeginText
-#BX org.apache.pdfbox.util.operator.NotImplemented
-c#=org.apache.pdfbox.util.operator.pagedrawer.CurveTo
-cm=org.apache.pdfbox.util.operator.Concatenate
-CS=org.apache.pdfbox.util.operator.SetStrokingColorSpace
-cs=org.apache.pdfbox.util.operator.SetNonStrokingColorSpace
-d#=org.apache.pdfbox.util.operator.pagedrawer.SetLineDashPattern
-#d0 org.apache.pdfbox.util.operator.NotImplemented
-#d1 org.apache.pdfbox.util.operator.NotImplemented
-Do#=org.apache.pdfbox.util.operator.pagedrawer.Invoke
-#DP org.apache.pdfbox.util.operator.NotImplemented ##Marked Content Point-- section 10.5
-#El org.apache.pdfbox.util.operator.NotImplemented
-#EMC org.apache.pdfbox.util.operator.NotImplemented ##End Marked Content -- section 10.5
-ET=org.apache.pdfbox.util.operator.EndText
-#EX org.apache.pdfbox.util.operator.NotImplemented
-f#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroRule
-F#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroRule
-f*#=org.apache.pdfbox.util.operator.pagedrawer.FillEvenOddRule
-G=org.apache.pdfbox.util.operator.SetStrokingGrayColor
-g=org.apache.pdfbox.util.operator.SetNonStrokingGrayColor
-gs=org.apache.pdfbox.util.operator.SetGraphicsStateParameters
-h#=org.apache.pdfbox.util.operator.pagedrawer.ClosePath
-#i org.apache.pdfbox.util.operator.NotImplemented
-#ID org.apache.pdfbox.util.operator.NotImplemented
-j#=org.apache.pdfbox.util.operator.pagedrawer.SetLineJoinStyle
-J#=org.apache.pdfbox.util.operator.pagedrawer.SetLineCapStyle
-K=org.apache.pdfbox.util.operator.SetStrokingCMYKColor
-k=org.apache.pdfbox.util.operator.SetNonStrokingCMYKColor
-l#=org.apache.pdfbox.util.operator.pagedrawer.LineTo
-m#=org.apache.pdfbox.util.operator.pagedrawer.MoveTo
-M#=org.apache.pdfbox.util.operator.pagedrawer.SetLineMiterLimit
-#MP org.apache.pdfbox.util.operator.NotImplemented ##Marked Content Point-- section 10.5
-n#=org.apache.pdfbox.util.operator.pagedrawer.EndPath
-q=org.apache.pdfbox.util.operator.GSave
-Q=org.apache.pdfbox.util.operator.GRestore
-re#=org.apache.pdfbox.util.operator.pagedrawer.AppendRectangleToPath
-RG=org.apache.pdfbox.util.operator.SetStrokingRGBColor
-rg=org.apache.pdfbox.util.operator.SetNonStrokingRGBColor
-#ri org.apache.pdfbox.util.operator.NotImplemented
-s=org.apache.pdfbox.util.operator.CloseAndStrokePath
-S#=org.apache.pdfbox.util.operator.pagedrawer.StrokePath
-SC=org.apache.pdfbox.util.operator.SetStrokingColor
-sc=org.apache.pdfbox.util.operator.SetNonStrokingColor
-SCN=org.apache.pdfbox.util.operator.SetStrokingColor
-scn=org.apache.pdfbox.util.operator.SetNonStrokingColor
-sh#=org.apache.pdfbox.util.operator.pagedrawer.SHFill
-T*=org.apache.pdfbox.util.operator.NextLine
-Tc=org.apache.pdfbox.util.operator.SetCharSpacing
-Td=org.apache.pdfbox.util.operator.MoveText
-TD=org.apache.pdfbox.util.operator.MoveTextSetLeading
-Tf=org.apache.pdfbox.util.operator.SetTextFont
-Tj=org.apache.pdfbox.util.operator.ShowText
-TJ=org.apache.pdfbox.util.operator.ShowTextGlyph
-TL=org.apache.pdfbox.util.operator.SetTextLeading
-Tm=org.apache.pdfbox.util.operator.SetMatrix
-Tr=org.apache.pdfbox.util.operator.SetTextRenderingMode
-Ts=org.apache.pdfbox.util.operator.SetTextRise
-Tw=org.apache.pdfbox.util.operator.SetWordSpacing
-Tz=org.apache.pdfbox.util.operator.SetHorizontalTextScaling
-v#=org.apache.pdfbox.util.operator.pagedrawer.CurveToReplicateInitialPoint
-w#=org.apache.pdfbox.util.operator.pagedrawer.SetLineWidth
-W# org.apache.pdfbox.util.operator.pagedrawer.ClipNonZeroRule
-W*# org.apache.pdfbox.util.operator.pagedrawer.ClipEvenOddRule
-y#=org.apache.pdfbox.util.operator.pagedrawer.CurveToReplicateFinalPoint
-\'=org.apache.pdfbox.util.operator.MoveAndShow
-\"=org.apache.pdfbox.util.operator.SetMoveAndShow
+
+# This table maps PDF stream operators to concrete OperatorProcessor
+# subclasses that are used by the PDFStreamEngine class to interpret the
+# PDF document. The classes configured here allow the PDFTextStripper
+# subclass of PDFStreamEngine to extract text content of the document.
+
+BT = org.apache.pdfbox.util.operator.BeginText
+cm = org.apache.pdfbox.util.operator.Concatenate
+Do = org.apache.pdfbox.util.operator.Invoke
+ET = org.apache.pdfbox.util.operator.EndText
+gs = org.apache.pdfbox.util.operator.SetGraphicsStateParameters
+q  = org.apache.pdfbox.util.operator.GSave
+Q  = org.apache.pdfbox.util.operator.GRestore
+T* = org.apache.pdfbox.util.operator.NextLine
+Tc = org.apache.pdfbox.util.operator.SetCharSpacing
+Td = org.apache.pdfbox.util.operator.MoveText
+TD = org.apache.pdfbox.util.operator.MoveTextSetLeading
+Tf = org.apache.pdfbox.util.operator.SetTextFont
+Tj = org.apache.pdfbox.util.operator.ShowText
+TJ = org.apache.pdfbox.util.operator.ShowTextGlyph
+TL = org.apache.pdfbox.util.operator.SetTextLeading
+Tm = org.apache.pdfbox.util.operator.SetMatrix
+Tr = org.apache.pdfbox.util.operator.SetTextRenderingMode
+Ts = org.apache.pdfbox.util.operator.SetTextRise
+Tw = org.apache.pdfbox.util.operator.SetWordSpacing
+Tz = org.apache.pdfbox.util.operator.SetHorizontalTextScaling
+w  = org.apache.pdfbox.util.operator.SetLineWidth
+\' = org.apache.pdfbox.util.operator.MoveAndShow
+\" = org.apache.pdfbox.util.operator.SetMoveAndShow
+
+# The following operators are not relevant to text extraction,
+# so we can silently ignore them.
+
+b
+B
+b*
+B*
+BDC
+BI
+BMC
+BX
+c
+CS
+cs
+d
+d0
+d1
+DP
+El
+EMC
+EX
+f
+F
+f*
+G
+g
+h
+i
+ID
+j
+J
+K
+k
+l
+m
+M
+MP
+n
+re
+RG
+rg
+ri
+s
+S
+SC
+sc
+SCN
+scn
+sh
+v
+W
+W*
+y
index a03348b5366cd30a83f15219d1f12b48b8f100df..a39d25740bf44230e8e95451e97ee109e2ba7696 100644 (file)
Binary files a/resources/tools/fwstk/out/artifacts/fwstk_jar/fwstk.jar and b/resources/tools/fwstk/out/artifacts/fwstk_jar/fwstk.jar differ
index 6938011f570cd63ffe097d2480b42f8d333590a1..13a585079c6f431e132c6fb552cbe3efd9e2a54d 100644 (file)
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
-# this Table is a correspondance Map of the PDF stream operators with concretes class of the
-# OperatorProcessor abstract class for the stategy pattern used in the 
-# org.apache.pdfbox.util.PDFStreamEngine class.
-# To change the behaviour of the system, remplace the class name by a new class name.
-b#=org.apache.pdfbox.util.operator.pagedrawer.CloseFillNonZeroAndStrokePath
-B#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroAndStrokePath
-b*#=org.apache.pdfbox.util.operator.pagedrawer.CloseFillEvenOddAndStrokePath
-B*#=org.apache.pdfbox.util.operator.pagedrawer.FillEvenOddAndStrokePath
-#BDC org.apache.pdfbox.util.operator.NotImplemented ##Begin Marked Content -- section 10.5
-BI#=org.apache.pdfbox.util.operator.pagedrawer.BeginInlineImage
-#BMC org.apache.pdfbox.util.operator.NotImplemented ##Begin Marked Content -- section 10.5
-BT=org.apache.pdfbox.util.operator.BeginText
-#BX org.apache.pdfbox.util.operator.NotImplemented
-c#=org.apache.pdfbox.util.operator.pagedrawer.CurveTo
-cm=org.apache.pdfbox.util.operator.Concatenate
-CS=org.apache.pdfbox.util.operator.SetStrokingColorSpace
-cs=org.apache.pdfbox.util.operator.SetNonStrokingColorSpace
-d#=org.apache.pdfbox.util.operator.pagedrawer.SetLineDashPattern
-#d0 org.apache.pdfbox.util.operator.NotImplemented
-#d1 org.apache.pdfbox.util.operator.NotImplemented
-Do#=org.apache.pdfbox.util.operator.pagedrawer.Invoke
-#DP org.apache.pdfbox.util.operator.NotImplemented ##Marked Content Point-- section 10.5
-#El org.apache.pdfbox.util.operator.NotImplemented
-#EMC org.apache.pdfbox.util.operator.NotImplemented ##End Marked Content -- section 10.5
-ET=org.apache.pdfbox.util.operator.EndText
-#EX org.apache.pdfbox.util.operator.NotImplemented
-f#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroRule
-F#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroRule
-f*#=org.apache.pdfbox.util.operator.pagedrawer.FillEvenOddRule
-G=org.apache.pdfbox.util.operator.SetStrokingGrayColor
-g=org.apache.pdfbox.util.operator.SetNonStrokingGrayColor
-gs=org.apache.pdfbox.util.operator.SetGraphicsStateParameters
-h#=org.apache.pdfbox.util.operator.pagedrawer.ClosePath
-#i org.apache.pdfbox.util.operator.NotImplemented
-#ID org.apache.pdfbox.util.operator.NotImplemented
-j#=org.apache.pdfbox.util.operator.pagedrawer.SetLineJoinStyle
-J#=org.apache.pdfbox.util.operator.pagedrawer.SetLineCapStyle
-K=org.apache.pdfbox.util.operator.SetStrokingCMYKColor
-k=org.apache.pdfbox.util.operator.SetNonStrokingCMYKColor
-l#=org.apache.pdfbox.util.operator.pagedrawer.LineTo
-m#=org.apache.pdfbox.util.operator.pagedrawer.MoveTo
-M#=org.apache.pdfbox.util.operator.pagedrawer.SetLineMiterLimit
-#MP org.apache.pdfbox.util.operator.NotImplemented ##Marked Content Point-- section 10.5
-n#=org.apache.pdfbox.util.operator.pagedrawer.EndPath
-q=org.apache.pdfbox.util.operator.GSave
-Q=org.apache.pdfbox.util.operator.GRestore
-re#=org.apache.pdfbox.util.operator.pagedrawer.AppendRectangleToPath
-RG=org.apache.pdfbox.util.operator.SetStrokingRGBColor
-rg=org.apache.pdfbox.util.operator.SetNonStrokingRGBColor
-#ri org.apache.pdfbox.util.operator.NotImplemented
-s=org.apache.pdfbox.util.operator.CloseAndStrokePath
-S#=org.apache.pdfbox.util.operator.pagedrawer.StrokePath
-SC=org.apache.pdfbox.util.operator.SetStrokingColor
-sc=org.apache.pdfbox.util.operator.SetNonStrokingColor
-SCN=org.apache.pdfbox.util.operator.SetStrokingColor
-scn=org.apache.pdfbox.util.operator.SetNonStrokingColor
-sh#=org.apache.pdfbox.util.operator.pagedrawer.SHFill
-T*=org.apache.pdfbox.util.operator.NextLine
-Tc=org.apache.pdfbox.util.operator.SetCharSpacing
-Td=org.apache.pdfbox.util.operator.MoveText
-TD=org.apache.pdfbox.util.operator.MoveTextSetLeading
-Tf=org.apache.pdfbox.util.operator.SetTextFont
-Tj=org.apache.pdfbox.util.operator.ShowText
-TJ=org.apache.pdfbox.util.operator.ShowTextGlyph
-TL=org.apache.pdfbox.util.operator.SetTextLeading
-Tm=org.apache.pdfbox.util.operator.SetMatrix
-Tr=org.apache.pdfbox.util.operator.SetTextRenderingMode
-Ts=org.apache.pdfbox.util.operator.SetTextRise
-Tw=org.apache.pdfbox.util.operator.SetWordSpacing
-Tz=org.apache.pdfbox.util.operator.SetHorizontalTextScaling
-v#=org.apache.pdfbox.util.operator.pagedrawer.CurveToReplicateInitialPoint
-w#=org.apache.pdfbox.util.operator.pagedrawer.SetLineWidth
-W# org.apache.pdfbox.util.operator.pagedrawer.ClipNonZeroRule
-W*# org.apache.pdfbox.util.operator.pagedrawer.ClipEvenOddRule
-y#=org.apache.pdfbox.util.operator.pagedrawer.CurveToReplicateFinalPoint
-\'=org.apache.pdfbox.util.operator.MoveAndShow
-\"=org.apache.pdfbox.util.operator.SetMoveAndShow
+
+# This table maps PDF stream operators to concrete OperatorProcessor
+# subclasses that are used by the PDFStreamEngine class to interpret the
+# PDF document. The classes configured here allow the PDFTextStripper
+# subclass of PDFStreamEngine to extract text content of the document.
+
+BT = org.apache.pdfbox.util.operator.BeginText
+cm = org.apache.pdfbox.util.operator.Concatenate
+Do = org.apache.pdfbox.util.operator.Invoke
+ET = org.apache.pdfbox.util.operator.EndText
+gs = org.apache.pdfbox.util.operator.SetGraphicsStateParameters
+q  = org.apache.pdfbox.util.operator.GSave
+Q  = org.apache.pdfbox.util.operator.GRestore
+T* = org.apache.pdfbox.util.operator.NextLine
+Tc = org.apache.pdfbox.util.operator.SetCharSpacing
+Td = org.apache.pdfbox.util.operator.MoveText
+TD = org.apache.pdfbox.util.operator.MoveTextSetLeading
+Tf = org.apache.pdfbox.util.operator.SetTextFont
+Tj = org.apache.pdfbox.util.operator.ShowText
+TJ = org.apache.pdfbox.util.operator.ShowTextGlyph
+TL = org.apache.pdfbox.util.operator.SetTextLeading
+Tm = org.apache.pdfbox.util.operator.SetMatrix
+Tr = org.apache.pdfbox.util.operator.SetTextRenderingMode
+Ts = org.apache.pdfbox.util.operator.SetTextRise
+Tw = org.apache.pdfbox.util.operator.SetWordSpacing
+Tz = org.apache.pdfbox.util.operator.SetHorizontalTextScaling
+w  = org.apache.pdfbox.util.operator.SetLineWidth
+\' = org.apache.pdfbox.util.operator.MoveAndShow
+\" = org.apache.pdfbox.util.operator.SetMoveAndShow
+
+# The following operators are not relevant to text extraction,
+# so we can silently ignore them.
+
+b
+B
+b*
+B*
+BDC
+BI
+BMC
+BX
+c
+CS
+cs
+d
+d0
+d1
+DP
+El
+EMC
+EX
+f
+F
+f*
+G
+g
+h
+i
+ID
+j
+J
+K
+k
+l
+m
+M
+MP
+n
+re
+RG
+rg
+ri
+s
+S
+SC
+sc
+SCN
+scn
+sh
+v
+W
+W*
+y
index 6938011f570cd63ffe097d2480b42f8d333590a1..13a585079c6f431e132c6fb552cbe3efd9e2a54d 100644 (file)
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
-# this Table is a correspondance Map of the PDF stream operators with concretes class of the
-# OperatorProcessor abstract class for the stategy pattern used in the 
-# org.apache.pdfbox.util.PDFStreamEngine class.
-# To change the behaviour of the system, remplace the class name by a new class name.
-b#=org.apache.pdfbox.util.operator.pagedrawer.CloseFillNonZeroAndStrokePath
-B#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroAndStrokePath
-b*#=org.apache.pdfbox.util.operator.pagedrawer.CloseFillEvenOddAndStrokePath
-B*#=org.apache.pdfbox.util.operator.pagedrawer.FillEvenOddAndStrokePath
-#BDC org.apache.pdfbox.util.operator.NotImplemented ##Begin Marked Content -- section 10.5
-BI#=org.apache.pdfbox.util.operator.pagedrawer.BeginInlineImage
-#BMC org.apache.pdfbox.util.operator.NotImplemented ##Begin Marked Content -- section 10.5
-BT=org.apache.pdfbox.util.operator.BeginText
-#BX org.apache.pdfbox.util.operator.NotImplemented
-c#=org.apache.pdfbox.util.operator.pagedrawer.CurveTo
-cm=org.apache.pdfbox.util.operator.Concatenate
-CS=org.apache.pdfbox.util.operator.SetStrokingColorSpace
-cs=org.apache.pdfbox.util.operator.SetNonStrokingColorSpace
-d#=org.apache.pdfbox.util.operator.pagedrawer.SetLineDashPattern
-#d0 org.apache.pdfbox.util.operator.NotImplemented
-#d1 org.apache.pdfbox.util.operator.NotImplemented
-Do#=org.apache.pdfbox.util.operator.pagedrawer.Invoke
-#DP org.apache.pdfbox.util.operator.NotImplemented ##Marked Content Point-- section 10.5
-#El org.apache.pdfbox.util.operator.NotImplemented
-#EMC org.apache.pdfbox.util.operator.NotImplemented ##End Marked Content -- section 10.5
-ET=org.apache.pdfbox.util.operator.EndText
-#EX org.apache.pdfbox.util.operator.NotImplemented
-f#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroRule
-F#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroRule
-f*#=org.apache.pdfbox.util.operator.pagedrawer.FillEvenOddRule
-G=org.apache.pdfbox.util.operator.SetStrokingGrayColor
-g=org.apache.pdfbox.util.operator.SetNonStrokingGrayColor
-gs=org.apache.pdfbox.util.operator.SetGraphicsStateParameters
-h#=org.apache.pdfbox.util.operator.pagedrawer.ClosePath
-#i org.apache.pdfbox.util.operator.NotImplemented
-#ID org.apache.pdfbox.util.operator.NotImplemented
-j#=org.apache.pdfbox.util.operator.pagedrawer.SetLineJoinStyle
-J#=org.apache.pdfbox.util.operator.pagedrawer.SetLineCapStyle
-K=org.apache.pdfbox.util.operator.SetStrokingCMYKColor
-k=org.apache.pdfbox.util.operator.SetNonStrokingCMYKColor
-l#=org.apache.pdfbox.util.operator.pagedrawer.LineTo
-m#=org.apache.pdfbox.util.operator.pagedrawer.MoveTo
-M#=org.apache.pdfbox.util.operator.pagedrawer.SetLineMiterLimit
-#MP org.apache.pdfbox.util.operator.NotImplemented ##Marked Content Point-- section 10.5
-n#=org.apache.pdfbox.util.operator.pagedrawer.EndPath
-q=org.apache.pdfbox.util.operator.GSave
-Q=org.apache.pdfbox.util.operator.GRestore
-re#=org.apache.pdfbox.util.operator.pagedrawer.AppendRectangleToPath
-RG=org.apache.pdfbox.util.operator.SetStrokingRGBColor
-rg=org.apache.pdfbox.util.operator.SetNonStrokingRGBColor
-#ri org.apache.pdfbox.util.operator.NotImplemented
-s=org.apache.pdfbox.util.operator.CloseAndStrokePath
-S#=org.apache.pdfbox.util.operator.pagedrawer.StrokePath
-SC=org.apache.pdfbox.util.operator.SetStrokingColor
-sc=org.apache.pdfbox.util.operator.SetNonStrokingColor
-SCN=org.apache.pdfbox.util.operator.SetStrokingColor
-scn=org.apache.pdfbox.util.operator.SetNonStrokingColor
-sh#=org.apache.pdfbox.util.operator.pagedrawer.SHFill
-T*=org.apache.pdfbox.util.operator.NextLine
-Tc=org.apache.pdfbox.util.operator.SetCharSpacing
-Td=org.apache.pdfbox.util.operator.MoveText
-TD=org.apache.pdfbox.util.operator.MoveTextSetLeading
-Tf=org.apache.pdfbox.util.operator.SetTextFont
-Tj=org.apache.pdfbox.util.operator.ShowText
-TJ=org.apache.pdfbox.util.operator.ShowTextGlyph
-TL=org.apache.pdfbox.util.operator.SetTextLeading
-Tm=org.apache.pdfbox.util.operator.SetMatrix
-Tr=org.apache.pdfbox.util.operator.SetTextRenderingMode
-Ts=org.apache.pdfbox.util.operator.SetTextRise
-Tw=org.apache.pdfbox.util.operator.SetWordSpacing
-Tz=org.apache.pdfbox.util.operator.SetHorizontalTextScaling
-v#=org.apache.pdfbox.util.operator.pagedrawer.CurveToReplicateInitialPoint
-w#=org.apache.pdfbox.util.operator.pagedrawer.SetLineWidth
-W# org.apache.pdfbox.util.operator.pagedrawer.ClipNonZeroRule
-W*# org.apache.pdfbox.util.operator.pagedrawer.ClipEvenOddRule
-y#=org.apache.pdfbox.util.operator.pagedrawer.CurveToReplicateFinalPoint
-\'=org.apache.pdfbox.util.operator.MoveAndShow
-\"=org.apache.pdfbox.util.operator.SetMoveAndShow
+
+# This table maps PDF stream operators to concrete OperatorProcessor
+# subclasses that are used by the PDFStreamEngine class to interpret the
+# PDF document. The classes configured here allow the PDFTextStripper
+# subclass of PDFStreamEngine to extract text content of the document.
+
+BT = org.apache.pdfbox.util.operator.BeginText
+cm = org.apache.pdfbox.util.operator.Concatenate
+Do = org.apache.pdfbox.util.operator.Invoke
+ET = org.apache.pdfbox.util.operator.EndText
+gs = org.apache.pdfbox.util.operator.SetGraphicsStateParameters
+q  = org.apache.pdfbox.util.operator.GSave
+Q  = org.apache.pdfbox.util.operator.GRestore
+T* = org.apache.pdfbox.util.operator.NextLine
+Tc = org.apache.pdfbox.util.operator.SetCharSpacing
+Td = org.apache.pdfbox.util.operator.MoveText
+TD = org.apache.pdfbox.util.operator.MoveTextSetLeading
+Tf = org.apache.pdfbox.util.operator.SetTextFont
+Tj = org.apache.pdfbox.util.operator.ShowText
+TJ = org.apache.pdfbox.util.operator.ShowTextGlyph
+TL = org.apache.pdfbox.util.operator.SetTextLeading
+Tm = org.apache.pdfbox.util.operator.SetMatrix
+Tr = org.apache.pdfbox.util.operator.SetTextRenderingMode
+Ts = org.apache.pdfbox.util.operator.SetTextRise
+Tw = org.apache.pdfbox.util.operator.SetWordSpacing
+Tz = org.apache.pdfbox.util.operator.SetHorizontalTextScaling
+w  = org.apache.pdfbox.util.operator.SetLineWidth
+\' = org.apache.pdfbox.util.operator.MoveAndShow
+\" = org.apache.pdfbox.util.operator.SetMoveAndShow
+
+# The following operators are not relevant to text extraction,
+# so we can silently ignore them.
+
+b
+B
+b*
+B*
+BDC
+BI
+BMC
+BX
+c
+CS
+cs
+d
+d0
+d1
+DP
+El
+EMC
+EX
+f
+F
+f*
+G
+g
+h
+i
+ID
+j
+J
+K
+k
+l
+m
+M
+MP
+n
+re
+RG
+rg
+ri
+s
+S
+SC
+sc
+SCN
+scn
+sh
+v
+W
+W*
+y
index 427e49a51525dd69de04720ff3fb9109c656a1bc..560c4ad3b9738f73596f72874ca8ba4a5a0ed268 100644 (file)
@@ -13,7 +13,6 @@ import java.nio.file.Paths;
 import java.util.*;
 
 import org.apache.commons.text.StringEscapeUtils;
-import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.exceptions.COSVisitorException;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentInformation;
index 5374acec2d834db9c9740f292dd3812404d1cfe8..9677c466ab1915fa31835837940ab1a3816dbcff 100644 (file)
@@ -1,11 +1,9 @@
 package com.fluidbook.fwstk.layout;
 
 import java.io.IOException;
-import java.util.List;
 import java.util.Properties;
 
 import org.apache.pdfbox.cos.COSStream;
-import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDResources;
 import org.apache.pdfbox.pdmodel.common.PDStream;