ODTPaser.py

# -*- coding: utf-8 -*-

from genshi.core import Markup
from django.utils.translation import ugettext_lazy as _
import bs4
import xml.etree.ElementTree as ET
from xml.sax.saxutils import escape as sax_escape
import datetime
import re

class ODTParser(object):
    """Convert HTML fragments and form controls into ODT XML snippets.

    While converting, the parser records which automatic styles
    (``self.styles``) and which form controls (``self.forms``) were used so
    that ``get_styles()`` and ``get_forms()`` can emit the matching
    declarations afterwards.

    The code targets Python 2 (Django ``ugettext_lazy``, genshi ``Markup``)
    but avoids Python-2-only builtins so that it also runs on Python 3.
    """

    # Class-level fallbacks kept for backward compatibility. __init__ gives
    # every instance its own lists, so these shared lists are only reached
    # when an instance is created without running __init__.
    styles = []
    forms = []

    # CSS declarations looked up in an HTML "style" attribute. The leading
    # ".*" is greedy so the LAST matching declaration wins; the declaration
    # may end with ";" or with the end of the attribute.
    _COLOR_RE = re.compile(
        r'.*[ ;"]color *: *([^; ]*) *(?:;|$)', re.DOTALL)
    _BACKGROUND_COLOR_RE = re.compile(
        r'.*[ ;"]background-color *: *([^; ]*) *(?:;|$)', re.DOTALL)

    def __init__(self):
        self.styles = []
        self.forms = []

    # ------------------------------------------------------------------
    # small private helpers
    # ------------------------------------------------------------------
    def _to_text(self, value):
        """Coerce *value* to text (``unicode`` on Python 2, ``str`` on 3)."""
        return u'%s' % (value,)

    def _attr(self, value):
        """Return *value* escaped for a double-quoted XML attribute."""
        return sax_escape(self._to_text(value), {u'"': u'&quot;'})

    # ------------------------------------------------------------------
    # plain values
    # ------------------------------------------------------------------
    def xstr(self, s, safe=True, escape=True):
        """Render a Python value as text suitable for an ODT template.

        * booleans become the translated strings 'Yes' / 'No';
        * ``datetime`` values are formatted as ``dd/mm/yy HH:MM``;
        * every other falsy value (``None``, ``''``, ``0``, ...) becomes
          ``''``;
        * otherwise the value is XML-escaped when *escape* is true and, when
          *safe* is true, wrapped in ``Markup`` with newlines turned into
          ``<text:line-break/>``.

        With ``safe=False`` and ``escape=False`` the value is returned
        untouched.
        """
        if isinstance(s, bool):
            return self._to_text(_('Yes') if s else _('No'))
        if isinstance(s, datetime.datetime):
            return s.strftime("%d/%m/%y %H:%M")
        if not s:
            return ''

        if escape:
            s = sax_escape(self._to_text(s))
        if safe:
            # Coerce first so that non-string values (e.g. an int passed
            # with escape=False) do not fail on .replace().
            text = self._to_text(s)
            return Markup(text.replace('\n', '<text:line-break/>'))
        return s

    # ------------------------------------------------------------------
    # HTML -> ODT
    # ------------------------------------------------------------------
    def html(self, s):
        """Convert the HTML in *s* to ODT XML and wrap it in ``Markup``."""
        return Markup(self.HTML_to_ODT(self._to_text(s)))

    def HTML_to_ODT(self, s):
        """Return the ODT XML (as text) equivalent to the HTML string *s*.

        Returns ``''`` when *s* is empty, is the string ``'None'`` or when
        the parsed document has no ``<body>``.
        """
        if not s or s == 'None':
            return ''
        # NOTE(review): no parser is named, so BeautifulSoup picks the best
        # one installed; ``.body`` presumably relies on a parser that wraps
        # fragments in <html><body> -- confirm before pinning a parser.
        body = bs4.BeautifulSoup(s).body
        if body is None:
            return ''

        root = ET.Element(u"root")
        root.tail = u''
        root.text = u''
        root = self._HTML_to_ODT(body, root)

        ret = ET.tostring(root, encoding='ascii', method='xml')
        if isinstance(ret, bytes):
            # Python 3 returns bytes (and Python 2 a byte string); the
            # output is pure ASCII, non-ASCII text being character refs.
            ret = ret.decode('ascii')
        if ret.startswith(u'<?xml'):
            # drop the XML declaration line
            ret = ret.partition(u'\n')[2]
        # Remove the artificial wrapper element, including the self-closed
        # form produced when the body yields no content at all.
        for wrapper in (u'<root>', u'</root>', u'<root />'):
            ret = ret.replace(wrapper, u'')
        return ret

    def _get_li_first_p(self, li):
        """Return the first ``text:p`` child of *li*, creating it if needed."""
        for elem in li:
            if elem.tag == 'text:p':
                return elem
        return ET.SubElement(li, "text:p")

    def _HTML_to_ODT(self, input_current, output_current, current_style=None):
        """Recursively copy the children of *input_current* into ODT form.

        *input_current* is a BeautifulSoup tag, *output_current* the
        ElementTree element receiving the converted children and
        *current_style* the list of style tokens inherited from the
        ancestors. Returns *output_current*.
        """
        if current_style is None:
            current_style = []

        output_child = None       # output element of the latest tag seen
        prev_input_child = None   # latest tag seen (text nodes are ignored)
        for input_child in input_current.contents:
            if isinstance(input_child, bs4.element.NavigableString):
                if type(input_child) is not bs4.element.NavigableString:
                    # Comments and the other special string nodes are
                    # ignored.
                    continue
                val = self._to_text(input_child)
                if output_child is not None:
                    output_child.tail = (output_child.tail or u'') + val
                elif output_current.tag == 'text:list-item':
                    # Text directly inside a list item must live in a
                    # paragraph, otherwise the ODT is not valid. Append so
                    # that consecutive text nodes are all kept.
                    p = self._get_li_first_p(output_current)
                    p.text = (p.text or u'') + val
                else:
                    output_current.text = (output_current.text or u'') + val
                continue

            prev_output_child = output_child
            new_current_style = list(current_style)
            name = input_child.name

            # Inline content of a list item goes into the item's first
            # paragraph; block children (p / ul / ol) stay direct children
            # of the list item itself.
            target = output_current
            if (output_current.tag == 'text:list-item'
                    and name not in ('p', 'ul', 'ol')):
                target = self._get_li_first_p(output_current)

            if name in ('h1', 'h2', 'h3'):
                level = name[1]
                output_child = ET.SubElement(target, u"text:h")
                output_child.set("text:style-name", "Heading_20_%s" % level)
                output_child.set("text:outline-level", level)
            elif name == 'p':
                output_child = ET.SubElement(target, u"text:p")
                self._HTML_to_ODT_set_style_read_styles(input_child, new_current_style)
                self._HTML_to_ODT_set_style(output_child, new_current_style)
            elif name == 'br':
                output_child = ET.SubElement(target, u"text:line-break")
            elif name == 'strong':
                output_child = ET.SubElement(target, u"text:span")
                new_current_style.append('ACTECIL_BOLD')
                self._HTML_to_ODT_set_style(output_child, new_current_style)
            elif name == 'em':
                output_child = ET.SubElement(target, u"text:span")
                new_current_style.append('ACTECIL_EM')
                self._HTML_to_ODT_set_style(output_child, new_current_style)
            elif name == 'span':
                output_child = ET.SubElement(target, u"text:span")
                self._HTML_to_ODT_set_style_read_styles(input_child, new_current_style)
                self._HTML_to_ODT_set_style(output_child, new_current_style)
            elif name in ('ul', 'ol'):
                if (prev_input_child is not None
                        and prev_input_child.name == 'li'
                        and prev_output_child is not None):
                    # List nested in a list: the HTML has
                    # <ul><li></li><ul>...</ul></ul> (sub-list outside the
                    # li) whereas ODT wants the sub-list inside the previous
                    # list item, so that item becomes the parent.
                    output_child = ET.SubElement(prev_output_child, u"text:list")
                else:
                    output_child = ET.SubElement(target, u"text:list")
                self._HTML_to_ODT_set_style_read_styles(input_child, new_current_style)
                self._HTML_to_ODT_set_style(output_child, new_current_style)
            elif name == 'li':
                output_child = ET.SubElement(target, u"text:list-item")
                self._HTML_to_ODT_set_style_read_styles(input_child, new_current_style)
                self._HTML_to_ODT_set_style(output_child, new_current_style)
            elif name == 'a':
                output_child = ET.SubElement(target, u"text:a")
                output_child.set("xlink:href", input_child.attrs.get('href', ''))
            else:
                # Unsupported tags become a paragraph at the document root
                # and a span everywhere else.
                if target.tag == 'root':
                    output_child = ET.SubElement(target, u"text:p")
                else:
                    output_child = ET.SubElement(target, u"text:span")

            output_child.tail = u''
            output_child.text = u''
            self._HTML_to_ODT(input_child, output_child, current_style=new_current_style)

            prev_input_child = input_child
        return output_current

    def _HTML_to_ODT_set_style(self, elt, current_style):
        """Give *elt* the style named after the tokens of *current_style*.

        The distinct tokens are sorted and joined with ``_`` to build the
        style name; the (tag, tokens) pair is recorded in ``self.styles``.
        Nothing happens when *current_style* is empty.
        """
        if not current_style:
            return
        styles = sorted(set(current_style))
        self.styles.append({'tag': elt.tag, 'styles': styles})
        elt.set("text:style-name", u'_'.join(styles))

    def _HTML_to_ODT_set_style_read_styles(self, elt, current_style):
        """Add to *current_style* the tokens found in *elt*'s ``style``.

        Recognised: ``text-decoration: underline``, ``text-align: justify``,
        ``color`` and ``background-color``. A colour token replaces any
        colour token of the same kind already present. *current_style* is
        modified in place.
        """
        raw_style = elt.attrs.get('style', None)
        if not raw_style:
            return
        # The leading space lets the patterns match a declaration that is
        # the very first thing in the attribute.
        html_styles = ' %s' % raw_style

        if 'text-decoration: underline' in html_styles:
            current_style.append('ACTECIL_UNDERLINE')
        if 'text-align: justify' in html_styles:
            current_style.append('ACTECIL_JUSTIFY')

        for token, pattern in (
                ('ACTECIL_COLOR', self._COLOR_RE),
                ('ACTECIL_BACKGROUNDCOLOR', self._BACKGROUND_COLOR_RE)):
            found = pattern.match(html_styles)
            if not found:
                continue
            # drop an inherited token of the same kind before adding ours
            current_style[:] = [
                item for item in current_style
                if item.split('__')[0] != token
            ]
            current_style.append('%s__%s' % (token, found.group(1)))

    def get_styles(self):
        """Return one ``<style:style>`` XML string per distinct style used.

        Entries of ``self.styles`` sharing both tag and style name are
        emitted once, in order of first appearance.
        """
        seen = set()
        unique = []
        for entry in self.styles:
            key = (entry['tag'], '_'.join(entry['styles']))
            if key not in seen:
                seen.add(key)
                unique.append(entry)

        xmlns = u'xmlns:officeooo="http://openoffice.org/2009/office" xmlns:fo="urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0" xmlns:style="urn:oasis:names:tc:opendocument:xmlns:style:1.0"'

        ret = []
        for entry in unique:
            tokens = entry['styles']
            name = '_'.join(tokens)
            s = ''
            style_family = 'text'
            style_property = 'text-properties'

            if 'ACTECIL_BOLD' in tokens:
                s += u' fo:font-weight="bold" style:font-weight-asian="bold" style:font-weight-complex="bold"'
            if 'ACTECIL_EM' in tokens:
                s += u' fo:font-style="italic" style:font-style-asian="italic" style:font-style-complex="italic"'
            if 'ACTECIL_UNDERLINE' in tokens:
                s += u' style:text-underline-style="solid" style:text-underline-width="auto" style:text-underline-color="font-color"'

            for token in tokens:
                # Colour values come from user-supplied HTML: escape them
                # before placing them inside an attribute.
                if token.startswith('ACTECIL_COLOR__'):
                    value = token[len('ACTECIL_COLOR__'):]
                    s += u' fo:color="%s"' % self._attr(value)
                if token.startswith('ACTECIL_BACKGROUNDCOLOR__'):
                    value = token[len('ACTECIL_BACKGROUNDCOLOR__'):]
                    s += u' fo:background-color="%s"' % self._attr(value)

            if 'ACTECIL_TEXTAREA_STYLE' in tokens:
                s += ' style:wrap="run-through" fo:background-color="#ffff99" fo:border="solid" style:number-wrapped-paragraphs="no-limit" style:vertical-pos="middle" style:vertical-rel="line" style:horizontal-pos="from-left" style:horizontal-rel="paragraph"'
                style_family = 'graphic'
                style_property = 'graphic-properties'

            if 'ACTECIL_TEXTAREA_TEXT_STYLE' in tokens:
                s += ' style:text-line-through-style="none" style:text-line-through-type="none" style:font-name="Liberation Sans" fo:font-size="12pt" fo:font-style="normal" style:text-underline-style="none" fo:font-weight="normal"'
                style_family = 'paragraph'
                style_property = 'text-properties'
            if 'ACTECIL_RADIO_STYLE' in tokens:
                s += ' xmlns:draw="urn:oasis:names:tc:opendocument:xmlns:drawing:1.0" draw:textarea-vertical-align="middle" style:wrap="run-through" style:number-wrapped-paragraphs="no-limit" style:vertical-pos="middle" style:vertical-rel="line" style:horizontal-pos="from-left" style:horizontal-rel="paragraph"'
                style_family = 'graphic'
                style_property = 'graphic-properties'
            if 'ACTECIL_RADIO_TEXT_STYLE' in tokens:
                s += ' style:text-line-through-style="none" style:text-line-through-type="none" style:font-name="Liberation Sans" fo:font-size="12pt" fo:font-style="normal" style:text-underline-style="none" fo:font-weight="normal"'
                style_family = 'paragraph'
                style_property = 'text-properties'

            if entry['tag'] == 'text:p' and 'ACTECIL_JUSTIFY' in tokens:
                s += u' fo:text-align="justify" style:justify-single-word="false"'
                style_family = 'paragraph'
                style_property = 'paragraph-properties'

            ret.append(u'<style:style %s style:name="%s" style:family="%s"><style:%s %s></style:%s></style:style>' % (xmlns, self._attr(name), style_family, style_property, s, style_property))

        return ret

    # ------------------------------------------------------------------
    # form controls
    # ------------------------------------------------------------------
    def form_textarea(self, id, value, width, height):
        """Register a multi-line text field and return its drawing markup.

        The ``<form:textarea>`` declaration is appended to ``self.forms``
        and the two styles it needs to ``self.styles``; the returned
        ``Markup`` is the ``<draw:control>`` paragraph referencing it.
        Newlines in *value* are stored as ``#!#ACTECILFORMNEWLINE#!#``.
        """
        if not value:
            value = ''
        control_id = self._attr(id)
        # Escape the double quote too: the value sits inside an attribute.
        current_value = self._attr(value).replace(u'\n', u'#!#ACTECILFORMNEWLINE#!#')

        s = u'''
<form:textarea form:name="%s" form:control-implementation="ooo:com.sun.star.form.component.TextField" xml:id="ACTECILCONTROL%s" form:id="ACTECILCONTROL%s" form:current-value="%s" form:convert-empty-to-null="true">
<form:properties>
<form:property form:property-name="ControlTypeinMSO" office:value-type="float" office:value="0"/>
<form:property form:property-name="DefaultControl" office:value-type="string" office:string-value="com.sun.star.form.control.TextField"/>
<form:property form:property-name="MultiLine" office:value-type="boolean" office:boolean-value="true"/>
<form:property form:property-name="ObjIDinMSO" office:value-type="float" office:value=""/>
</form:properties>
</form:textarea>
''' % (control_id, control_id, control_id, current_value)
        self.forms.append(s)

        s = '''
<text:p>
<draw:control draw:style-name="ACTECIL_TEXTAREA_STYLE" draw:text-style-name="ACTECIL_TEXTAREA_TEXT_STYLE" text:anchor-type="as-char" draw:z-index="1" svg:width="%s" svg:height="%s" draw:control="ACTECILCONTROL%s"/>
</text:p>
''' % (self._attr(width), self._attr(height), control_id)

        self.styles.append({'tag': '', 'styles': ['ACTECIL_TEXTAREA_STYLE', ]})
        self.styles.append({'tag': '', 'styles': ['ACTECIL_TEXTAREA_TEXT_STYLE', ]})

        return Markup(s)

    def form_radio(self, id, grp_id, label, is_selected, width, height):
        """Register a radio button and return its drawing markup.

        The ``<form:radio>`` declaration goes to ``self.forms``, the styles
        it needs to ``self.styles``; the returned ``Markup`` is the
        ``<draw:control>`` paragraph referencing it.
        """
        selected = 'true' if is_selected else 'false'
        control_id = self._attr(id)
        s = '''
<form:radio form:name="%s" form:control-implementation="ooo:com.sun.star.form.component.RadioButton" xml:id="ACTECILCONTROL%s" form:id="ACTECILCONTROL%s" form:label="%s" form:current-selected="%s" formx:group-name="%s" form:image-position="center">
<form:properties>
<form:property form:property-name="ControlTypeinMSO" office:value-type="float" office:value="0"/>
<form:property form:property-name="DefaultControl" office:value-type="string" office:string-value="com.sun.star.form.control.RadioButton"/>
<form:property form:property-name="ObjIDinMSO" office:value-type="float" office:value=""/>
<form:property form:property-name="SecondaryRefValue" office:value-type="string" office:string-value=""/>
</form:properties>
</form:radio>
''' % (control_id, control_id, control_id, self._attr(label), selected, self._attr(grp_id))
        self.forms.append(s)

        s = '''
<text:p>
<draw:control text:anchor-type="as-char" draw:z-index="2" draw:style-name="ACTECIL_RADIO_STYLE" draw:text-style-name="ACTECIL_RADIO_TEXT_STYLE" svg:width="%s" svg:height="%s" draw:control="ACTECILCONTROL%s"/>
</text:p>
''' % (self._attr(width), self._attr(height), control_id)

        self.styles.append({'tag': '', 'styles': ['ACTECIL_RADIO_STYLE', ]})
        self.styles.append({'tag': '', 'styles': ['ACTECIL_RADIO_TEXT_STYLE', ]})

        return Markup(s)

    def form_checkbox(self, id, label, is_selected, width, height):
        """Register a check box and return its drawing markup.

        The ``<form:checkbox>`` declaration goes to ``self.forms``, the
        styles it needs (shared with radio buttons) to ``self.styles``; the
        returned ``Markup`` is the ``<draw:control>`` paragraph.
        """
        selected = 'true' if is_selected else 'false'
        control_id = self._attr(id)
        s = '''
<form:checkbox form:name="%s" form:control-implementation="ooo:com.sun.star.form.component.CheckBox" xml:id="ACTECILCONTROL%s" form:id="ACTECILCONTROL%s" form:label="%s" form:image-position="center" form:current-selected="%s">
<form:properties>
<form:property form:property-name="ControlTypeinMSO" office:value-type="float" office:value="0"/>
<form:property form:property-name="DefaultControl" office:value-type="string" office:string-value="com.sun.star.form.control.CheckBox"/>
<form:property form:property-name="ObjIDinMSO" office:value-type="float" office:value=""/>
<form:property form:property-name="SecondaryRefValue" office:value-type="string" office:string-value=""/>
</form:properties>
</form:checkbox>
''' % (control_id, control_id, control_id, self._attr(label), selected)
        self.forms.append(s)

        s = '''
<text:p>
<draw:control text:anchor-type="as-char" svg:y="-0.093cm" draw:z-index="0" draw:style-name="ACTECIL_RADIO_STYLE" draw:text-style-name="ACTECIL_RADIO_TEXT_STYLE" svg:width="%s" svg:height="%s" draw:control="ACTECILCONTROL%s"/>
</text:p>
''' % (self._attr(width), self._attr(height), control_id)

        self.styles.append({'tag': '', 'styles': ['ACTECIL_RADIO_STYLE', ]})
        self.styles.append({'tag': '', 'styles': ['ACTECIL_RADIO_TEXT_STYLE', ]})

        return Markup(s)

    def get_forms(self):
        """Return the ``<form:form>`` XML wrapping every registered control.

        Returns ``''`` when no control was registered.
        """
        if not self.forms:
            return ''

        xmlns = u'xmlns:officeooo="http://openoffice.org/2009/office" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0" xmlns:form="urn:oasis:names:tc:opendocument:xmlns:form:1.0" xmlns:formx="urn:openoffice:names:experimental:ooxml-odf-interop:xmlns:form:1.0"'

        s = '''
<form:form %s form:name="ACTECIL_FORM" form:apply-filter="true" form:command-type="table" form:control-implementation="ooo:com.sun.star.form.component.Form" office:target-frame="" xlink:href="" xlink:type="simple">
%s
</form:form>
''' % (xmlns, '\n'.join(self.forms))

        return s

    def get_controls_values(self, content_xml):
        """Read back the form controls found in a parsed ``content.xml``.

        *content_xml* is an element tree exposing ``getroot()`` and
        ``findall()``. Returns a dict with the keys ``'textarea'``,
        ``'checkbox'`` and ``'radio'``; each maps to a list of dicts holding
        the control element (``control_xml``), its name split on ``'__'``
        (``name``) and its value (``value``: the text for text areas, a
        boolean for radio buttons and check boxes).
        """
        ret = {
            'textarea': [],
            'checkbox': [],
            'radio': [],
        }

        # Fallback prefixes, overridden by whatever the root element
        # declares. "form" defaults to the namespace used by get_forms() so
        # the lookups below work even if the root does not declare it.
        namespaces = dict(
            text="urn:text",
            draw="urn:draw",
            table="urn:table",
            office="urn:office",
            xlink="urn:xlink",
            svg="urn:svg",
            manifest="urn:manifest",
            form="urn:oasis:names:tc:opendocument:xmlns:form:1.0",
        )
        # ``nsmap`` is not available on every tree implementation.
        root_nsmap = getattr(content_xml.getroot(), 'nsmap', None)
        if root_nsmap:
            namespaces.update(root_nsmap)
        # a default namespace (None key) cannot be used as a path prefix
        namespaces.pop(None, None)

        def attr_prefix(elem):
            """Return the '{uri}' part of *elem*'s tag, or '' without one."""
            if elem.tag[0] == "{":
                return '{%s}' % elem.tag[1:].partition("}")[0]
            return ''

        def collect(kind, value_attr, as_bool):
            """Fill ret[kind] from every <form:kind> element of the tree."""
            path = ".//form:%s" % kind
            for field in content_xml.findall(path, namespaces=namespaces):
                prefix = attr_prefix(field)
                if as_bool:
                    value = field.attrib.get(prefix + value_attr, 'false') == 'true'
                else:
                    value = field.attrib.get(prefix + value_attr, '')
                ret[kind].append(
                    {
                        'control_xml': field,
                        'name': field.attrib.get(prefix + 'name', '').split('__'),
                        'value': value,
                    }
                )

        collect('textarea', 'current-value', False)
        collect('radio', 'current-selected', True)
        collect('checkbox', 'current-selected', True)

        return ret
Publicités

1 réflexion à propos de “ ODTPaser.py ”

Laisser un commentaire

Entrez vos coordonnées ci-dessous ou cliquez sur une icône pour vous connecter:

Logo WordPress.com

Vous commentez à l'aide de votre compte WordPress.com. ( Déconnexion / Changer )

Image Twitter

Vous commentez à l'aide de votre compte Twitter. ( Déconnexion / Changer )

Photo Facebook

Vous commentez à l'aide de votre compte Facebook. ( Déconnexion / Changer )

Photo Google+

Vous commentez à l'aide de votre compte Google+. ( Déconnexion / Changer )

Connexion à %s