Author: lehmi
Date: Tue Mar  9 19:20:15 2010
New Revision: 921063

URL: http://svn.apache.org/viewvc?rev=921063&view=rev
Log:
PDFBOX-7: added support for the extraction of information of tagged pdfs. Patch 
by Johannes Koch (johannes dot koch at fit dot fraunhofer dot de)

Added:
    
pdfbox/trunk/src/main/resources/Resources/PDFMarkedContentExtractor.properties

Added: 
pdfbox/trunk/src/main/resources/Resources/PDFMarkedContentExtractor.properties
URL: 
http://svn.apache.org/viewvc/pdfbox/trunk/src/main/resources/Resources/PDFMarkedContentExtractor.properties?rev=921063&view=auto
==============================================================================
--- 
pdfbox/trunk/src/main/resources/Resources/PDFMarkedContentExtractor.properties 
(added)
+++ 
pdfbox/trunk/src/main/resources/Resources/PDFMarkedContentExtractor.properties 
Tue Mar  9 19:20:15 2010
@@ -0,0 +1,98 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This table maps PDF stream operators to concrete OperatorProcessor
+# subclasses that are used by the PDFStreamEngine class to interpret the
+# PDF document. The classes configured here allow the PDFMarkedContentExtractor
+# to extract the text and marked-content sequences of the document.
+
+BT = org.apache.pdfbox.util.operator.BeginText
+cm = org.apache.pdfbox.util.operator.Concatenate
+Do = org.apache.pdfbox.util.operator.Invoke
+ET = org.apache.pdfbox.util.operator.EndText
+gs = org.apache.pdfbox.util.operator.SetGraphicsStateParameters
+q  = org.apache.pdfbox.util.operator.GSave
+Q  = org.apache.pdfbox.util.operator.GRestore
+T* = org.apache.pdfbox.util.operator.NextLine
+Tc = org.apache.pdfbox.util.operator.SetCharSpacing
+Td = org.apache.pdfbox.util.operator.MoveText
+TD = org.apache.pdfbox.util.operator.MoveTextSetLeading
+Tf = org.apache.pdfbox.util.operator.SetTextFont
+Tj = org.apache.pdfbox.util.operator.ShowText
+TJ = org.apache.pdfbox.util.operator.ShowTextGlyph
+TL = org.apache.pdfbox.util.operator.SetTextLeading
+Tm = org.apache.pdfbox.util.operator.SetMatrix
+Tr = org.apache.pdfbox.util.operator.SetTextRenderingMode
+Ts = org.apache.pdfbox.util.operator.SetTextRise
+Tw = org.apache.pdfbox.util.operator.SetWordSpacing
+Tz = org.apache.pdfbox.util.operator.SetHorizontalTextScaling
+w  = org.apache.pdfbox.util.operator.SetLineWidth
+\' = org.apache.pdfbox.util.operator.MoveAndShow
+\" = org.apache.pdfbox.util.operator.SetMoveAndShow
+
+BDC = org.apache.pdfbox.util.operator.BeginMarkedContentSequenceWithProperties
+BMC = org.apache.pdfbox.util.operator.BeginMarkedContentSequence
+EMC = org.apache.pdfbox.util.operator.EndMarkedContentSequence
+
+# The following operators are not relevant to text extraction,
+# so we can silently ignore them.
+
+b
+B
+b*
+B*
+BI
+BX
+c
+CS
+cs
+d
+d0
+d1
+DP
+EI
+EX
+f
+F
+f*
+G
+g
+h
+i
+ID
+j
+J
+K
+k
+l
+m
+M
+MP
+n
+re
+RG
+rg
+ri
+s
+S
+SC
+sc
+SCN
+scn
+sh
+v
+W
+W*
+y
\ No newline at end of file


Reply via email to