Author: lehmi
Date: Tue Mar 9 19:20:15 2010
New Revision: 921063
URL: http://svn.apache.org/viewvc?rev=921063&view=rev
Log:
PDFBOX-7: added support for the extraction of information of tagged pdfs. Patch
by Johannes Koch (johannes dot koch at fit dot fraunhofer dot de)
Added:
pdfbox/trunk/src/main/resources/Resources/PDFMarkedContentExtractor.properties
Added:
pdfbox/trunk/src/main/resources/Resources/PDFMarkedContentExtractor.properties
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/src/main/resources/Resources/PDFMarkedContentExtractor.properties?rev=921063&view=auto
==============================================================================
---
pdfbox/trunk/src/main/resources/Resources/PDFMarkedContentExtractor.properties
(added)
+++
pdfbox/trunk/src/main/resources/Resources/PDFMarkedContentExtractor.properties
Tue Mar 9 19:20:15 2010
@@ -0,0 +1,98 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This table maps PDF stream operators to concrete OperatorProcessor
+# subclasses that are used by the PDFStreamEngine class to interpret the
+# PDF document. The classes configured here allow the PDFMarkedContentExtractor
+# subclass of PDFStreamEngine to extract the marked content of the document.
+
+BT = org.apache.pdfbox.util.operator.BeginText
+cm = org.apache.pdfbox.util.operator.Concatenate
+Do = org.apache.pdfbox.util.operator.Invoke
+ET = org.apache.pdfbox.util.operator.EndText
+gs = org.apache.pdfbox.util.operator.SetGraphicsStateParameters
+q = org.apache.pdfbox.util.operator.GSave
+Q = org.apache.pdfbox.util.operator.GRestore
+T* = org.apache.pdfbox.util.operator.NextLine
+Tc = org.apache.pdfbox.util.operator.SetCharSpacing
+Td = org.apache.pdfbox.util.operator.MoveText
+TD = org.apache.pdfbox.util.operator.MoveTextSetLeading
+Tf = org.apache.pdfbox.util.operator.SetTextFont
+Tj = org.apache.pdfbox.util.operator.ShowText
+TJ = org.apache.pdfbox.util.operator.ShowTextGlyph
+TL = org.apache.pdfbox.util.operator.SetTextLeading
+Tm = org.apache.pdfbox.util.operator.SetMatrix
+Tr = org.apache.pdfbox.util.operator.SetTextRenderingMode
+Ts = org.apache.pdfbox.util.operator.SetTextRise
+Tw = org.apache.pdfbox.util.operator.SetWordSpacing
+Tz = org.apache.pdfbox.util.operator.SetHorizontalTextScaling
+w = org.apache.pdfbox.util.operator.SetLineWidth
+\' = org.apache.pdfbox.util.operator.MoveAndShow
+\" = org.apache.pdfbox.util.operator.SetMoveAndShow
+
+BDC = org.apache.pdfbox.util.operator.BeginMarkedContentSequenceWithProperties
+BMC = org.apache.pdfbox.util.operator.BeginMarkedContentSequence
+EMC = org.apache.pdfbox.util.operator.EndMarkedContentSequence
+
+# The following operators are not relevant to text extraction,
+# so we can silently ignore them.
+
+b
+B
+b*
+B*
+BI
+BX
+c
+CS
+cs
+d
+d0
+d1
+DP
+EI
+EX
+f
+F
+f*
+G
+g
+h
+i
+ID
+j
+J
+K
+k
+l
+m
+M
+MP
+n
+re
+RG
+rg
+ri
+s
+S
+SC
+sc
+SCN
+scn
+sh
+v
+W
+W*
+y
\ No newline at end of file