Commit 0467908d authored by Neeme Kahusk

polaris2xml batch usage

parent 91cc7c2f
......@@ -3,6 +3,8 @@
"""Converts ewn Polaris format to xml
for i in ../kb*/kb*utf8.norm ; do j=`echo $i|sed -E 's/[.][.][/]kb[0-9][0-9][/](kb[0-9][0-9][-]utf8)[.]norm/\1.xml/g'`;n=`echo $j|sed -E 's/kb([0-9][0-9])[-].*/\1/1'` ; echo $i $j $n ;~/GIT/newn/src/newn/polaris2xml.py -n kb -v $n -l est $i $j ; done
for i in ../kb[0-9][0-9]/kb[0-9][0-9]-utf8.txt ; do j=`echo $i|sed -E 's/[.][.][/]kb[0-9]{2}[/](kb[0-9]{2}[-]utf8)[.]txt/\1.xml/g'`;n=`echo $j|sed -E 's/kb([0-9]{2})[-].*/\1/1'` ; echo $i $j $n ;~/GIT/newn/src/newn/polaris2xml.py -n kb -v $n -l est $i $j ; done
"""
......@@ -19,7 +21,13 @@ def read2xml(read: list, lexattrs: dict = {}) -> etree._ElementTree:
level = -1
attrs = {}
for rida in read:
if len(rida) >= 2:
if len(rida) == 1:
tag = 'COMMENT'
element = etree.Element(tag)
element.text = rida[0].strip('# ')
parent.append(element)
currentlevel = level
elif len(rida) >= 2:
currentlevel = int(rida[0])
if currentlevel == 0:
parent = root
......@@ -38,7 +46,11 @@ def read2xml(read: list, lexattrs: dict = {}) -> etree._ElementTree:
txt = rida[-1]
if txt.startswith('"'):
txt = txt[1:-1].strip()
element.text = txt
try:
element.text = txt
except ValueError:
print(txt)
exit()
if currentlevel == level:
parent = parent.getparent()
elif currentlevel < level:
......@@ -55,7 +67,8 @@ def read2xml(read: list, lexattrs: dict = {}) -> etree._ElementTree:
def parse_line(rida: str) -> list:
    """Split one input line into at most three whitespace-trimmed fields.

    A line whose first non-blank character is ``#`` is a comment and is
    returned whole (trimmed) as a one-element list. Any other line is split
    on runs of whitespace into at most three fields; the remainder of the
    line, inner spaces included, becomes the last field.

    Args:
        rida: One raw line of input, with or without a line terminator.

    Returns:
        ``[comment]`` for a comment line, otherwise a list of zero to three
        fields (an empty list for a blank line).
    """
    stripped = rida.strip()
    # Test the trimmed line so an indented comment is not split into fields.
    if stripped.startswith('#'):
        # Return the trimmed text so the line terminator does not leak into
        # the comment, consistent with the trimming of ordinary fields.
        return [stripped]
    # With no separator argument, split() discards surrounding whitespace,
    # so the fields need no further trimming.
    return stripped.split(maxsplit=2)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment