<component name="MarkdownSettingsMigration">
<option name="stateVersion" value="1" />
</component>
- <component name="PhpWorkspaceProjectConfiguration" interpreter_name="PHP 8.2" />
+ <component name="PhpWorkspaceProjectConfiguration" interpreter_name="PHP 8.5" />
<component name="ProjectColorInfo">{
"associatedIndex": 8
}</component>
<option name="hideEmptyMiddlePackages" value="true" />
<option name="showLibraryContents" value="true" />
</component>
- <component name="PropertiesComponent">{
- "keyToString": {
- "Application.extract links.executor": "Run",
- "RunOnceActivity.OpenProjectViewOnStart": "true",
- "RunOnceActivity.ShowReadmeOnStart": "true",
- "RunOnceActivity.git.unshallow": "true",
- "WebServerToolWindowFactoryState": "true",
- "WebServerToolWindowPanel.toolwindow.highlight.mappings": "true",
- "WebServerToolWindowPanel.toolwindow.highlight.symlinks": "true",
- "WebServerToolWindowPanel.toolwindow.show.date": "false",
- "WebServerToolWindowPanel.toolwindow.show.permissions": "false",
- "WebServerToolWindowPanel.toolwindow.show.size": "false",
- "git-widget-placeholder": "master",
- "ignore.virus.scanning.warn.message": "true",
- "junie.onboarding.icon.badge.shown": "true",
- "kotlin-language-version-configured": "true",
- "last_opened_file_path": "D:/Works/cubist_pdf/resources/tools/fwstk",
- "node.js.detected.package.eslint": "true",
- "node.js.detected.package.tslint": "true",
- "node.js.selected.package.eslint": "(autodetect)",
- "node.js.selected.package.tslint": "(autodetect)",
- "nodejs_package_manager_path": "npm",
- "project.structure.last.edited": "Libraries",
- "project.structure.proportion": "0.15",
- "project.structure.side.proportion": "0.2",
- "ruby.rails.projectView.checked": "true",
- "settings.editor.selected.configurable": "preferences.lookFeel",
- "vue.rearranger.settings.migration": "true"
+ <component name="PropertiesComponent"><![CDATA[{
+ "keyToString": {
+ "Application.extract links.executor": "Run",
+ "Application.extract texts.executor": "Run",
+ "RunOnceActivity.OpenProjectViewOnStart": "true",
+ "RunOnceActivity.ShowReadmeOnStart": "true",
+ "RunOnceActivity.git.unshallow": "true",
+ "RunOnceActivity.typescript.service.memoryLimit.init": "true",
+ "WebServerToolWindowFactoryState": "true",
+ "WebServerToolWindowPanel.toolwindow.highlight.mappings": "true",
+ "WebServerToolWindowPanel.toolwindow.highlight.symlinks": "true",
+ "WebServerToolWindowPanel.toolwindow.show.date": "false",
+ "WebServerToolWindowPanel.toolwindow.show.permissions": "false",
+ "WebServerToolWindowPanel.toolwindow.show.size": "false",
+ "git-widget-placeholder": "master",
+ "ignore.virus.scanning.warn.message": "true",
+ "junie.onboarding.icon.badge.shown": "true",
+ "kotlin-language-version-configured": "true",
+ "last_opened_file_path": "D:/Works/cubist_pdf/resources/tools/fwstk",
+ "node.js.detected.package.eslint": "true",
+ "node.js.detected.package.tslint": "true",
+ "node.js.selected.package.eslint": "(autodetect)",
+ "node.js.selected.package.tslint": "(autodetect)",
+ "nodejs_package_manager_path": "npm",
+ "project.structure.last.edited": "Libraries",
+ "project.structure.proportion": "0.15",
+ "project.structure.side.proportion": "0.2",
+ "ruby.rails.projectView.checked": "true",
+ "settings.editor.selected.configurable": "preferences.lookFeel",
+ "vue.rearranger.settings.migration": "true"
}
-}</component>
+}]]></component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
<recent name="H:\Works\cubeExtranet\fluidbook\tools\fwstk\lib" />
<recent name="H:\Works\cubeExtranet\fluidbook\tools\fwstk" />
</key>
</component>
- <component name="RunManager" selected="Application.extract links">
+ <component name="RunManager" selected="Application.extract texts">
<configuration default="true" type="AndroidRunConfigurationType" factoryName="Android App">
<option name="DEPLOY" value="true" />
<option name="DEPLOY_APK_FROM_BUNDLE" value="false" />
<option name="USE_PATTERN" value="false" />
<method />
</configuration>
- <configuration default="true" type="tests" factoryName="Nosetests">
- <module name="fwstk" />
- <option name="INTERPRETER_OPTIONS" value="" />
- <option name="PARENT_ENVS" value="true" />
- <option name="SDK_HOME" value="" />
- <option name="WORKING_DIRECTORY" value="" />
- <option name="IS_MODULE_SDK" value="false" />
- <option name="ADD_CONTENT_ROOTS" value="true" />
- <option name="ADD_SOURCE_ROOTS" value="true" />
- <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
- <option name="_new_regexPattern" value="""" />
- <option name="_new_additionalArguments" value="""" />
- <option name="_new_target" value=""."" />
- <option name="_new_targetType" value=""PATH"" />
- <method v="2" />
- </configuration>
- <configuration default="true" type="tests" factoryName="Unittests">
- <module name="fwstk" />
- <option name="INTERPRETER_OPTIONS" value="" />
- <option name="PARENT_ENVS" value="true" />
- <option name="SDK_HOME" value="" />
- <option name="WORKING_DIRECTORY" value="" />
- <option name="IS_MODULE_SDK" value="false" />
- <option name="ADD_CONTENT_ROOTS" value="true" />
- <option name="ADD_SOURCE_ROOTS" value="true" />
- <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
- <option name="_new_additionalArguments" value="""" />
- <option name="_new_target" value=""."" />
- <option name="_new_targetType" value=""PATH"" />
- <method v="2" />
- </configuration>
- <configuration default="true" type="tests" factoryName="py.test">
- <module name="fwstk" />
- <option name="INTERPRETER_OPTIONS" value="" />
- <option name="PARENT_ENVS" value="true" />
- <option name="SDK_HOME" value="" />
- <option name="WORKING_DIRECTORY" value="" />
- <option name="IS_MODULE_SDK" value="false" />
- <option name="ADD_CONTENT_ROOTS" value="true" />
- <option name="ADD_SOURCE_ROOTS" value="true" />
- <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
- <option name="_new_keywords" value="""" />
- <option name="_new_parameters" value="""" />
- <option name="_new_additionalArguments" value="""" />
- <option name="_new_target" value=""."" />
- <option name="_new_targetType" value=""PATH"" />
- <method v="2" />
- </configuration>
<configuration name="extract layout" type="Application" factoryName="Application">
<option name="MAIN_CLASS_NAME" value="com.fluidbook.fwstk.Main" />
<module name="fwstk" />
<configuration name="extract texts" type="Application" factoryName="Application">
<option name="MAIN_CLASS_NAME" value="com.fluidbook.fwstk.Main" />
<module name="fwstk" />
- <option name="PROGRAM_PARAMETERS" value="--input C:\Users\vince\Desktop\20929.pdf --mode robust --extractTextsMethod fluidbook --extractTexts C:\Users\vince\Desktop\20929\%s%d.txt --threads 1" />
+ <option name="PROGRAM_PARAMETERS" value="--input C:\Users\vince\Downloads\fluidbook_33884.pdf --extractTextsMethod fluidbook --extractTexts C:\Users\vince\Desktop\33884\%s%d.txt --threads 1" />
<method v="2">
<option name="Make" enabled="true" />
</method>
<envs />
<method v="2" />
</configuration>
+ <configuration default="true" type="tests" factoryName="Nosetests">
+ <module name="fwstk" />
+ <option name="ENV_FILES" value="" />
+ <option name="INTERPRETER_OPTIONS" value="" />
+ <option name="PARENT_ENVS" value="true" />
+ <option name="SDK_HOME" value="" />
+ <option name="WORKING_DIRECTORY" value="" />
+ <option name="IS_MODULE_SDK" value="false" />
+ <option name="ADD_CONTENT_ROOTS" value="true" />
+ <option name="ADD_SOURCE_ROOTS" value="true" />
+ <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+ <option name="RUN_TOOL" value="" />
+ <option name="_new_regexPattern" value="""" />
+ <option name="_new_additionalArguments" value="""" />
+ <option name="_new_target" value=""."" />
+ <option name="_new_targetType" value=""PATH"" />
+ <method v="2" />
+ </configuration>
+ <configuration default="true" type="tests" factoryName="Unittests">
+ <module name="fwstk" />
+ <option name="ENV_FILES" value="" />
+ <option name="INTERPRETER_OPTIONS" value="" />
+ <option name="PARENT_ENVS" value="true" />
+ <option name="SDK_HOME" value="" />
+ <option name="WORKING_DIRECTORY" value="" />
+ <option name="IS_MODULE_SDK" value="false" />
+ <option name="ADD_CONTENT_ROOTS" value="true" />
+ <option name="ADD_SOURCE_ROOTS" value="true" />
+ <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+ <option name="RUN_TOOL" value="" />
+ <option name="_new_additionalArguments" value="""" />
+ <option name="_new_target" value=""."" />
+ <option name="_new_targetType" value=""PATH"" />
+ <method v="2" />
+ </configuration>
+ <configuration default="true" type="tests" factoryName="py.test">
+ <module name="fwstk" />
+ <option name="ENV_FILES" value="" />
+ <option name="INTERPRETER_OPTIONS" value="" />
+ <option name="PARENT_ENVS" value="true" />
+ <option name="SDK_HOME" value="" />
+ <option name="WORKING_DIRECTORY" value="" />
+ <option name="IS_MODULE_SDK" value="false" />
+ <option name="ADD_CONTENT_ROOTS" value="true" />
+ <option name="ADD_SOURCE_ROOTS" value="true" />
+ <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+ <option name="RUN_TOOL" value="" />
+ <option name="_new_keywords" value="""" />
+ <option name="_new_parameters" value="""" />
+ <option name="_new_additionalArguments" value="""" />
+ <option name="_new_target" value=""."" />
+ <option name="_new_targetType" value=""PATH"" />
+ <method v="2" />
+ </configuration>
<list>
<item itemvalue="Application.extract layout" />
<item itemvalue="Application.extract links" />
<component name="SharedIndexes">
<attachedChunks>
<set>
- <option value="bundled-jdk-9823dce3aa75-fbdcb00ec9e3-intellij.indexing.shared.core-IU-251.25410.129" />
- <option value="bundled-js-predefined-d6986cc7102b-6a121458b545-JavaScript-IU-251.25410.129" />
+ <option value="bundled-jdk-30f59d01ecdd-2fc7cc6b9a17-intellij.indexing.shared.core-IU-253.30387.90" />
+ <option value="bundled-js-predefined-d6986cc7102b-9b0f141eb926-JavaScript-IU-253.30387.90" />
</set>
</attachedChunks>
</component>
<workItem from="1748351552423" duration="1932000" />
<workItem from="1748355409566" duration="1244000" />
<workItem from="1748356736199" duration="3367000" />
+ <workItem from="1769008557586" duration="495000" />
+ <workItem from="1769682693343" duration="1080000" />
</task>
<task id="LOCAL-00001" summary="wip #1111 @0.5">
<created>1487172253077</created>
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-#
-# this Table is a correspondance Map of the PDF stream operators with concretes class of the
-# OperatorProcessor abstract class for the stategy pattern used in the
-# org.apache.pdfbox.util.PDFStreamEngine class.
-# To change the behaviour of the system, remplace the class name by a new class name.
-b#=org.apache.pdfbox.util.operator.pagedrawer.CloseFillNonZeroAndStrokePath
-B#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroAndStrokePath
-b*#=org.apache.pdfbox.util.operator.pagedrawer.CloseFillEvenOddAndStrokePath
-B*#=org.apache.pdfbox.util.operator.pagedrawer.FillEvenOddAndStrokePath
-#BDC org.apache.pdfbox.util.operator.NotImplemented ##Begin Marked Content -- section 10.5
-BI#=org.apache.pdfbox.util.operator.pagedrawer.BeginInlineImage
-#BMC org.apache.pdfbox.util.operator.NotImplemented ##Begin Marked Content -- section 10.5
-BT=org.apache.pdfbox.util.operator.BeginText
-#BX org.apache.pdfbox.util.operator.NotImplemented
-c#=org.apache.pdfbox.util.operator.pagedrawer.CurveTo
-cm=org.apache.pdfbox.util.operator.Concatenate
-CS=org.apache.pdfbox.util.operator.SetStrokingColorSpace
-cs=org.apache.pdfbox.util.operator.SetNonStrokingColorSpace
-d#=org.apache.pdfbox.util.operator.pagedrawer.SetLineDashPattern
-#d0 org.apache.pdfbox.util.operator.NotImplemented
-#d1 org.apache.pdfbox.util.operator.NotImplemented
-Do#=org.apache.pdfbox.util.operator.pagedrawer.Invoke
-#DP org.apache.pdfbox.util.operator.NotImplemented ##Marked Content Point-- section 10.5
-#El org.apache.pdfbox.util.operator.NotImplemented
-#EMC org.apache.pdfbox.util.operator.NotImplemented ##End Marked Content -- section 10.5
-ET=org.apache.pdfbox.util.operator.EndText
-#EX org.apache.pdfbox.util.operator.NotImplemented
-f#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroRule
-F#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroRule
-f*#=org.apache.pdfbox.util.operator.pagedrawer.FillEvenOddRule
-G=org.apache.pdfbox.util.operator.SetStrokingGrayColor
-g=org.apache.pdfbox.util.operator.SetNonStrokingGrayColor
-gs=org.apache.pdfbox.util.operator.SetGraphicsStateParameters
-h#=org.apache.pdfbox.util.operator.pagedrawer.ClosePath
-#i org.apache.pdfbox.util.operator.NotImplemented
-#ID org.apache.pdfbox.util.operator.NotImplemented
-j#=org.apache.pdfbox.util.operator.pagedrawer.SetLineJoinStyle
-J#=org.apache.pdfbox.util.operator.pagedrawer.SetLineCapStyle
-K=org.apache.pdfbox.util.operator.SetStrokingCMYKColor
-k=org.apache.pdfbox.util.operator.SetNonStrokingCMYKColor
-l#=org.apache.pdfbox.util.operator.pagedrawer.LineTo
-m#=org.apache.pdfbox.util.operator.pagedrawer.MoveTo
-M#=org.apache.pdfbox.util.operator.pagedrawer.SetLineMiterLimit
-#MP org.apache.pdfbox.util.operator.NotImplemented ##Marked Content Point-- section 10.5
-n#=org.apache.pdfbox.util.operator.pagedrawer.EndPath
-q=org.apache.pdfbox.util.operator.GSave
-Q=org.apache.pdfbox.util.operator.GRestore
-re#=org.apache.pdfbox.util.operator.pagedrawer.AppendRectangleToPath
-RG=org.apache.pdfbox.util.operator.SetStrokingRGBColor
-rg=org.apache.pdfbox.util.operator.SetNonStrokingRGBColor
-#ri org.apache.pdfbox.util.operator.NotImplemented
-s=org.apache.pdfbox.util.operator.CloseAndStrokePath
-S#=org.apache.pdfbox.util.operator.pagedrawer.StrokePath
-SC=org.apache.pdfbox.util.operator.SetStrokingColor
-sc=org.apache.pdfbox.util.operator.SetNonStrokingColor
-SCN=org.apache.pdfbox.util.operator.SetStrokingColor
-scn=org.apache.pdfbox.util.operator.SetNonStrokingColor
-sh#=org.apache.pdfbox.util.operator.pagedrawer.SHFill
-T*=org.apache.pdfbox.util.operator.NextLine
-Tc=org.apache.pdfbox.util.operator.SetCharSpacing
-Td=org.apache.pdfbox.util.operator.MoveText
-TD=org.apache.pdfbox.util.operator.MoveTextSetLeading
-Tf=org.apache.pdfbox.util.operator.SetTextFont
-Tj=org.apache.pdfbox.util.operator.ShowText
-TJ=org.apache.pdfbox.util.operator.ShowTextGlyph
-TL=org.apache.pdfbox.util.operator.SetTextLeading
-Tm=org.apache.pdfbox.util.operator.SetMatrix
-Tr=org.apache.pdfbox.util.operator.SetTextRenderingMode
-Ts=org.apache.pdfbox.util.operator.SetTextRise
-Tw=org.apache.pdfbox.util.operator.SetWordSpacing
-Tz=org.apache.pdfbox.util.operator.SetHorizontalTextScaling
-v#=org.apache.pdfbox.util.operator.pagedrawer.CurveToReplicateInitialPoint
-w#=org.apache.pdfbox.util.operator.pagedrawer.SetLineWidth
-W# org.apache.pdfbox.util.operator.pagedrawer.ClipNonZeroRule
-W*# org.apache.pdfbox.util.operator.pagedrawer.ClipEvenOddRule
-y#=org.apache.pdfbox.util.operator.pagedrawer.CurveToReplicateFinalPoint
-\'=org.apache.pdfbox.util.operator.MoveAndShow
-\"=org.apache.pdfbox.util.operator.SetMoveAndShow
+
+# This table maps PDF stream operators to concrete OperatorProcessor
+# subclasses that are used by the PDFStreamEngine class to interpret the
+# PDF document. The classes configured here allow the PDFTextStripper
+# subclass of PDFStreamEngine to extract text content of the document.
+
+BT = org.apache.pdfbox.util.operator.BeginText
+cm = org.apache.pdfbox.util.operator.Concatenate
+Do = org.apache.pdfbox.util.operator.Invoke
+ET = org.apache.pdfbox.util.operator.EndText
+gs = org.apache.pdfbox.util.operator.SetGraphicsStateParameters
+q = org.apache.pdfbox.util.operator.GSave
+Q = org.apache.pdfbox.util.operator.GRestore
+T* = org.apache.pdfbox.util.operator.NextLine
+Tc = org.apache.pdfbox.util.operator.SetCharSpacing
+Td = org.apache.pdfbox.util.operator.MoveText
+TD = org.apache.pdfbox.util.operator.MoveTextSetLeading
+Tf = org.apache.pdfbox.util.operator.SetTextFont
+Tj = org.apache.pdfbox.util.operator.ShowText
+TJ = org.apache.pdfbox.util.operator.ShowTextGlyph
+TL = org.apache.pdfbox.util.operator.SetTextLeading
+Tm = org.apache.pdfbox.util.operator.SetMatrix
+Tr = org.apache.pdfbox.util.operator.SetTextRenderingMode
+Ts = org.apache.pdfbox.util.operator.SetTextRise
+Tw = org.apache.pdfbox.util.operator.SetWordSpacing
+Tz = org.apache.pdfbox.util.operator.SetHorizontalTextScaling
+w = org.apache.pdfbox.util.operator.SetLineWidth
+\' = org.apache.pdfbox.util.operator.MoveAndShow
+\" = org.apache.pdfbox.util.operator.SetMoveAndShow
+
+# The following operators are not relevant to text extraction,
+# so we can silently ignore them.
+
+b
+B
+b*
+B*
+BDC
+BI
+BMC
+BX
+c
+CS
+cs
+d
+d0
+d1
+DP
+El
+EMC
+EX
+f
+F
+f*
+G
+g
+h
+i
+ID
+j
+J
+K
+k
+l
+m
+M
+MP
+n
+re
+RG
+rg
+ri
+s
+S
+SC
+sc
+SCN
+scn
+sh
+v
+W
+W*
+y
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-#
-# this Table is a correspondance Map of the PDF stream operators with concretes class of the
-# OperatorProcessor abstract class for the stategy pattern used in the
-# org.apache.pdfbox.util.PDFStreamEngine class.
-# To change the behaviour of the system, remplace the class name by a new class name.
-b#=org.apache.pdfbox.util.operator.pagedrawer.CloseFillNonZeroAndStrokePath
-B#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroAndStrokePath
-b*#=org.apache.pdfbox.util.operator.pagedrawer.CloseFillEvenOddAndStrokePath
-B*#=org.apache.pdfbox.util.operator.pagedrawer.FillEvenOddAndStrokePath
-#BDC org.apache.pdfbox.util.operator.NotImplemented ##Begin Marked Content -- section 10.5
-BI#=org.apache.pdfbox.util.operator.pagedrawer.BeginInlineImage
-#BMC org.apache.pdfbox.util.operator.NotImplemented ##Begin Marked Content -- section 10.5
-BT=org.apache.pdfbox.util.operator.BeginText
-#BX org.apache.pdfbox.util.operator.NotImplemented
-c#=org.apache.pdfbox.util.operator.pagedrawer.CurveTo
-cm=org.apache.pdfbox.util.operator.Concatenate
-CS=org.apache.pdfbox.util.operator.SetStrokingColorSpace
-cs=org.apache.pdfbox.util.operator.SetNonStrokingColorSpace
-d#=org.apache.pdfbox.util.operator.pagedrawer.SetLineDashPattern
-#d0 org.apache.pdfbox.util.operator.NotImplemented
-#d1 org.apache.pdfbox.util.operator.NotImplemented
-Do#=org.apache.pdfbox.util.operator.pagedrawer.Invoke
-#DP org.apache.pdfbox.util.operator.NotImplemented ##Marked Content Point-- section 10.5
-#El org.apache.pdfbox.util.operator.NotImplemented
-#EMC org.apache.pdfbox.util.operator.NotImplemented ##End Marked Content -- section 10.5
-ET=org.apache.pdfbox.util.operator.EndText
-#EX org.apache.pdfbox.util.operator.NotImplemented
-f#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroRule
-F#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroRule
-f*#=org.apache.pdfbox.util.operator.pagedrawer.FillEvenOddRule
-G=org.apache.pdfbox.util.operator.SetStrokingGrayColor
-g=org.apache.pdfbox.util.operator.SetNonStrokingGrayColor
-gs=org.apache.pdfbox.util.operator.SetGraphicsStateParameters
-h#=org.apache.pdfbox.util.operator.pagedrawer.ClosePath
-#i org.apache.pdfbox.util.operator.NotImplemented
-#ID org.apache.pdfbox.util.operator.NotImplemented
-j#=org.apache.pdfbox.util.operator.pagedrawer.SetLineJoinStyle
-J#=org.apache.pdfbox.util.operator.pagedrawer.SetLineCapStyle
-K=org.apache.pdfbox.util.operator.SetStrokingCMYKColor
-k=org.apache.pdfbox.util.operator.SetNonStrokingCMYKColor
-l#=org.apache.pdfbox.util.operator.pagedrawer.LineTo
-m#=org.apache.pdfbox.util.operator.pagedrawer.MoveTo
-M#=org.apache.pdfbox.util.operator.pagedrawer.SetLineMiterLimit
-#MP org.apache.pdfbox.util.operator.NotImplemented ##Marked Content Point-- section 10.5
-n#=org.apache.pdfbox.util.operator.pagedrawer.EndPath
-q=org.apache.pdfbox.util.operator.GSave
-Q=org.apache.pdfbox.util.operator.GRestore
-re#=org.apache.pdfbox.util.operator.pagedrawer.AppendRectangleToPath
-RG=org.apache.pdfbox.util.operator.SetStrokingRGBColor
-rg=org.apache.pdfbox.util.operator.SetNonStrokingRGBColor
-#ri org.apache.pdfbox.util.operator.NotImplemented
-s=org.apache.pdfbox.util.operator.CloseAndStrokePath
-S#=org.apache.pdfbox.util.operator.pagedrawer.StrokePath
-SC=org.apache.pdfbox.util.operator.SetStrokingColor
-sc=org.apache.pdfbox.util.operator.SetNonStrokingColor
-SCN=org.apache.pdfbox.util.operator.SetStrokingColor
-scn=org.apache.pdfbox.util.operator.SetNonStrokingColor
-sh#=org.apache.pdfbox.util.operator.pagedrawer.SHFill
-T*=org.apache.pdfbox.util.operator.NextLine
-Tc=org.apache.pdfbox.util.operator.SetCharSpacing
-Td=org.apache.pdfbox.util.operator.MoveText
-TD=org.apache.pdfbox.util.operator.MoveTextSetLeading
-Tf=org.apache.pdfbox.util.operator.SetTextFont
-Tj=org.apache.pdfbox.util.operator.ShowText
-TJ=org.apache.pdfbox.util.operator.ShowTextGlyph
-TL=org.apache.pdfbox.util.operator.SetTextLeading
-Tm=org.apache.pdfbox.util.operator.SetMatrix
-Tr=org.apache.pdfbox.util.operator.SetTextRenderingMode
-Ts=org.apache.pdfbox.util.operator.SetTextRise
-Tw=org.apache.pdfbox.util.operator.SetWordSpacing
-Tz=org.apache.pdfbox.util.operator.SetHorizontalTextScaling
-v#=org.apache.pdfbox.util.operator.pagedrawer.CurveToReplicateInitialPoint
-w#=org.apache.pdfbox.util.operator.pagedrawer.SetLineWidth
-W# org.apache.pdfbox.util.operator.pagedrawer.ClipNonZeroRule
-W*# org.apache.pdfbox.util.operator.pagedrawer.ClipEvenOddRule
-y#=org.apache.pdfbox.util.operator.pagedrawer.CurveToReplicateFinalPoint
-\'=org.apache.pdfbox.util.operator.MoveAndShow
-\"=org.apache.pdfbox.util.operator.SetMoveAndShow
+
+# This table maps PDF stream operators to concrete OperatorProcessor
+# subclasses that are used by the PDFStreamEngine class to interpret the
+# PDF document. The classes configured here allow the PDFTextStripper
+# subclass of PDFStreamEngine to extract text content of the document.
+
+BT = org.apache.pdfbox.util.operator.BeginText
+cm = org.apache.pdfbox.util.operator.Concatenate
+Do = org.apache.pdfbox.util.operator.Invoke
+ET = org.apache.pdfbox.util.operator.EndText
+gs = org.apache.pdfbox.util.operator.SetGraphicsStateParameters
+q = org.apache.pdfbox.util.operator.GSave
+Q = org.apache.pdfbox.util.operator.GRestore
+T* = org.apache.pdfbox.util.operator.NextLine
+Tc = org.apache.pdfbox.util.operator.SetCharSpacing
+Td = org.apache.pdfbox.util.operator.MoveText
+TD = org.apache.pdfbox.util.operator.MoveTextSetLeading
+Tf = org.apache.pdfbox.util.operator.SetTextFont
+Tj = org.apache.pdfbox.util.operator.ShowText
+TJ = org.apache.pdfbox.util.operator.ShowTextGlyph
+TL = org.apache.pdfbox.util.operator.SetTextLeading
+Tm = org.apache.pdfbox.util.operator.SetMatrix
+Tr = org.apache.pdfbox.util.operator.SetTextRenderingMode
+Ts = org.apache.pdfbox.util.operator.SetTextRise
+Tw = org.apache.pdfbox.util.operator.SetWordSpacing
+Tz = org.apache.pdfbox.util.operator.SetHorizontalTextScaling
+w = org.apache.pdfbox.util.operator.SetLineWidth
+\' = org.apache.pdfbox.util.operator.MoveAndShow
+\" = org.apache.pdfbox.util.operator.SetMoveAndShow
+
+# The following operators are not relevant to text extraction,
+# so we can silently ignore them.
+
+b
+B
+b*
+B*
+BDC
+BI
+BMC
+BX
+c
+CS
+cs
+d
+d0
+d1
+DP
+El
+EMC
+EX
+f
+F
+f*
+G
+g
+h
+i
+ID
+j
+J
+K
+k
+l
+m
+M
+MP
+n
+re
+RG
+rg
+ri
+s
+S
+SC
+sc
+SCN
+scn
+sh
+v
+W
+W*
+y
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-#
-# this Table is a correspondance Map of the PDF stream operators with concretes class of the
-# OperatorProcessor abstract class for the stategy pattern used in the
-# org.apache.pdfbox.util.PDFStreamEngine class.
-# To change the behaviour of the system, remplace the class name by a new class name.
-b#=org.apache.pdfbox.util.operator.pagedrawer.CloseFillNonZeroAndStrokePath
-B#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroAndStrokePath
-b*#=org.apache.pdfbox.util.operator.pagedrawer.CloseFillEvenOddAndStrokePath
-B*#=org.apache.pdfbox.util.operator.pagedrawer.FillEvenOddAndStrokePath
-#BDC org.apache.pdfbox.util.operator.NotImplemented ##Begin Marked Content -- section 10.5
-BI#=org.apache.pdfbox.util.operator.pagedrawer.BeginInlineImage
-#BMC org.apache.pdfbox.util.operator.NotImplemented ##Begin Marked Content -- section 10.5
-BT=org.apache.pdfbox.util.operator.BeginText
-#BX org.apache.pdfbox.util.operator.NotImplemented
-c#=org.apache.pdfbox.util.operator.pagedrawer.CurveTo
-cm=org.apache.pdfbox.util.operator.Concatenate
-CS=org.apache.pdfbox.util.operator.SetStrokingColorSpace
-cs=org.apache.pdfbox.util.operator.SetNonStrokingColorSpace
-d#=org.apache.pdfbox.util.operator.pagedrawer.SetLineDashPattern
-#d0 org.apache.pdfbox.util.operator.NotImplemented
-#d1 org.apache.pdfbox.util.operator.NotImplemented
-Do#=org.apache.pdfbox.util.operator.pagedrawer.Invoke
-#DP org.apache.pdfbox.util.operator.NotImplemented ##Marked Content Point-- section 10.5
-#El org.apache.pdfbox.util.operator.NotImplemented
-#EMC org.apache.pdfbox.util.operator.NotImplemented ##End Marked Content -- section 10.5
-ET=org.apache.pdfbox.util.operator.EndText
-#EX org.apache.pdfbox.util.operator.NotImplemented
-f#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroRule
-F#=org.apache.pdfbox.util.operator.pagedrawer.FillNonZeroRule
-f*#=org.apache.pdfbox.util.operator.pagedrawer.FillEvenOddRule
-G=org.apache.pdfbox.util.operator.SetStrokingGrayColor
-g=org.apache.pdfbox.util.operator.SetNonStrokingGrayColor
-gs=org.apache.pdfbox.util.operator.SetGraphicsStateParameters
-h#=org.apache.pdfbox.util.operator.pagedrawer.ClosePath
-#i org.apache.pdfbox.util.operator.NotImplemented
-#ID org.apache.pdfbox.util.operator.NotImplemented
-j#=org.apache.pdfbox.util.operator.pagedrawer.SetLineJoinStyle
-J#=org.apache.pdfbox.util.operator.pagedrawer.SetLineCapStyle
-K=org.apache.pdfbox.util.operator.SetStrokingCMYKColor
-k=org.apache.pdfbox.util.operator.SetNonStrokingCMYKColor
-l#=org.apache.pdfbox.util.operator.pagedrawer.LineTo
-m#=org.apache.pdfbox.util.operator.pagedrawer.MoveTo
-M#=org.apache.pdfbox.util.operator.pagedrawer.SetLineMiterLimit
-#MP org.apache.pdfbox.util.operator.NotImplemented ##Marked Content Point-- section 10.5
-n#=org.apache.pdfbox.util.operator.pagedrawer.EndPath
-q=org.apache.pdfbox.util.operator.GSave
-Q=org.apache.pdfbox.util.operator.GRestore
-re#=org.apache.pdfbox.util.operator.pagedrawer.AppendRectangleToPath
-RG=org.apache.pdfbox.util.operator.SetStrokingRGBColor
-rg=org.apache.pdfbox.util.operator.SetNonStrokingRGBColor
-#ri org.apache.pdfbox.util.operator.NotImplemented
-s=org.apache.pdfbox.util.operator.CloseAndStrokePath
-S#=org.apache.pdfbox.util.operator.pagedrawer.StrokePath
-SC=org.apache.pdfbox.util.operator.SetStrokingColor
-sc=org.apache.pdfbox.util.operator.SetNonStrokingColor
-SCN=org.apache.pdfbox.util.operator.SetStrokingColor
-scn=org.apache.pdfbox.util.operator.SetNonStrokingColor
-sh#=org.apache.pdfbox.util.operator.pagedrawer.SHFill
-T*=org.apache.pdfbox.util.operator.NextLine
-Tc=org.apache.pdfbox.util.operator.SetCharSpacing
-Td=org.apache.pdfbox.util.operator.MoveText
-TD=org.apache.pdfbox.util.operator.MoveTextSetLeading
-Tf=org.apache.pdfbox.util.operator.SetTextFont
-Tj=org.apache.pdfbox.util.operator.ShowText
-TJ=org.apache.pdfbox.util.operator.ShowTextGlyph
-TL=org.apache.pdfbox.util.operator.SetTextLeading
-Tm=org.apache.pdfbox.util.operator.SetMatrix
-Tr=org.apache.pdfbox.util.operator.SetTextRenderingMode
-Ts=org.apache.pdfbox.util.operator.SetTextRise
-Tw=org.apache.pdfbox.util.operator.SetWordSpacing
-Tz=org.apache.pdfbox.util.operator.SetHorizontalTextScaling
-v#=org.apache.pdfbox.util.operator.pagedrawer.CurveToReplicateInitialPoint
-w#=org.apache.pdfbox.util.operator.pagedrawer.SetLineWidth
-W# org.apache.pdfbox.util.operator.pagedrawer.ClipNonZeroRule
-W*# org.apache.pdfbox.util.operator.pagedrawer.ClipEvenOddRule
-y#=org.apache.pdfbox.util.operator.pagedrawer.CurveToReplicateFinalPoint
-\'=org.apache.pdfbox.util.operator.MoveAndShow
-\"=org.apache.pdfbox.util.operator.SetMoveAndShow
+
+# This table maps PDF stream operators to concrete OperatorProcessor
+# subclasses that are used by the PDFStreamEngine class to interpret the
+# PDF document. The classes configured here allow the PDFTextStripper
+# subclass of PDFStreamEngine to extract text content of the document.
+
+BT = org.apache.pdfbox.util.operator.BeginText
+cm = org.apache.pdfbox.util.operator.Concatenate
+Do = org.apache.pdfbox.util.operator.Invoke
+ET = org.apache.pdfbox.util.operator.EndText
+gs = org.apache.pdfbox.util.operator.SetGraphicsStateParameters
+q = org.apache.pdfbox.util.operator.GSave
+Q = org.apache.pdfbox.util.operator.GRestore
+T* = org.apache.pdfbox.util.operator.NextLine
+Tc = org.apache.pdfbox.util.operator.SetCharSpacing
+Td = org.apache.pdfbox.util.operator.MoveText
+TD = org.apache.pdfbox.util.operator.MoveTextSetLeading
+Tf = org.apache.pdfbox.util.operator.SetTextFont
+Tj = org.apache.pdfbox.util.operator.ShowText
+TJ = org.apache.pdfbox.util.operator.ShowTextGlyph
+TL = org.apache.pdfbox.util.operator.SetTextLeading
+Tm = org.apache.pdfbox.util.operator.SetMatrix
+Tr = org.apache.pdfbox.util.operator.SetTextRenderingMode
+Ts = org.apache.pdfbox.util.operator.SetTextRise
+Tw = org.apache.pdfbox.util.operator.SetWordSpacing
+Tz = org.apache.pdfbox.util.operator.SetHorizontalTextScaling
+w = org.apache.pdfbox.util.operator.SetLineWidth
+\' = org.apache.pdfbox.util.operator.MoveAndShow
+\" = org.apache.pdfbox.util.operator.SetMoveAndShow
+
+# The following operators are not relevant to text extraction,
+# so we can silently ignore them.
+
+b
+B
+b*
+B*
+BDC
+BI
+BMC
+BX
+c
+CS
+cs
+d
+d0
+d1
+DP
+El
+EMC
+EX
+f
+F
+f*
+G
+g
+h
+i
+ID
+j
+J
+K
+k
+l
+m
+M
+MP
+n
+re
+RG
+rg
+ri
+s
+S
+SC
+sc
+SCN
+scn
+sh
+v
+W
+W*
+y
import java.util.*;
import org.apache.commons.text.StringEscapeUtils;
-import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.exceptions.COSVisitorException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
package com.fluidbook.fwstk.layout;
import java.io.IOException;
-import java.util.List;
import java.util.Properties;
import org.apache.pdfbox.cos.COSStream;
-import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDStream;