Fla-Hyd
Fla-Hyd

Reputation: 279

looping over multiple patterns in python regular expressions

Hi, I have an input file in the format below.

    .....
    ......

    <TABLE COLS="3">
             <ROW>
                <R>data</R>
                <R>data</R>   
              </ROW>
              <ROW>
                <R>data</R>
                <R>data</R>
                <R>data</R>
              </ROW>
    </TABLE>
    <TABLE COLS="4">
             <ROW>
                <R>data</R>
                <R>data</R>
                <R>data</R>
                <R>data</R>
                <R>data</R>   
              </ROW>
              <ROW>
                <R>data</R>
                <R>data</R>
              </ROW>
    </TABLE> 
    .......
    .....
    .
    ...

The output file should look like this:

....
....
.
..

<table ct="3">
<ent="1">
<ent="2">
<ent="3">

         <row>
            <rvn ="1">data</rvn>
            <rvn ="2">data</rvn>  
          </row>
          <row>
            <rvn ="1">data</rvn>
            <rvn ="2">data</rvn>
            <rvn ="3">data</rvn>  
          </row>
</table>
<table ct="4">
<ent="1">
<ent="2">
<ent="3">
<ent="4">
         <row>
            <rvn ="1">data</rvn>
            <rvn ="2">data</rvn> 
            <rvn ="3">data</rvn> 
            <rvn ="4">data</rvn>
            <rvn ="5">data</rvn>
          </row>
          <row>
            <rvn ="1">data</rvn>
            <rvn ="2">data</rvn>
          </row>
</table>
...
...
...

I have written the code below. When I run it, every table's column count is replaced by the last table's column count, and I am also having trouble incrementing the <rvn> value. Can anyone please help me solve these problems?

    import re

    def tblcnv( st, val ):
        """Replace every opening table tag in *st* with a new header.

        The header is ``<table ct='VAL'>`` followed by one ``<colspec>`` line
        per column, numbered 1..VAL.

        st  -- text containing the table markup to rewrite.
        val -- column count as a string of digits (e.g. ``"3"``).

        Returns the rewritten text. Raises ValueError if *val* is not an
        integer literal.
        """
        Tcolspec = "".join(
            "<colspec col='" + str(i) + "' colwidth=''/>\n"
            for i in range(1, int(val) + 1)
        )
        # Substitute once, after the colspec list is complete. Doing it inside
        # the loop repeated the work and left the result unbound for val == "0".
        return re.sub(r"(?i)<table.*?>", "<table ct='" + val + "'>\n" + Tcolspec + "\n", st)

    # Driver: read in.txt, rewrite each table through tblcnv(), write out.txt.
    # Neither file handle is closed explicitly in this snippet.
    in_data = open("in.txt", "r")
    out_data = open("out.txt", "w")
    Rdata = in_data.read()
    # Flatten to a single line so "." in the pattern below can span what were
    # separate lines ("." does not match "\n" without re.DOTALL).
    Rval = Rdata.replace("\n", " ")

    # NOTE(review): the ".*" between "<TABLE" and "cols=" is greedy, so it runs
    # to the LAST cols="N" in the text. A single match then covers every table,
    # and group(2) is the last table's column count.
    Rval = re.sub("(?i)(<TABLE.*cols=\"(\d+).*?</TABLE>)", lambda m: tblcnv(m.group(1), m.group(2)), Rval)
    out_data.write(Rval)

Upvotes: 1

Views: 145

Answers (2)

ATOzTOA
ATOzTOA

Reputation: 35950

Here is your working code...

Note: You should not really use regular expressions for this; parsing the markup with a proper parser is the more reliable approach.

import re

# Shared cell-number iterator; rowcnv() rebinds it to a fresh one for each row.
counter = None

def datacnv( st ):
    """Wrap one cell's text in an ``<rvn>`` tag carrying the next cell number.

    The number is taken from the module-level ``counter``, so this must only
    be called after ``rowcnv`` has installed an iterator there.

    st -- the text captured between ``<R>`` and ``</R>``.

    Returns ``<rvn="N">st</rvn>`` followed by a newline.
    """
    global counter
    return "<rvn=\""+ next(counter) +"\">" + st + "</rvn>\n"

def rowcnv( st ):
    """Convert the inside of one ``<ROW>`` into a ``<row>`` of numbered cells.

    Every ``<R>...</R>`` in *st* (case-insensitive) is replaced via
    ``datacnv``; numbering restarts at 1 on each call.

    st -- the text captured between ``<ROW>`` and ``</ROW>``.

    Returns the converted cells wrapped in ``<row>\\n`` ... ``</row>\\n``.
    """
    global counter
    import itertools

    # Lazy and unbounded, so rows with ten or more cells keep numbering; a
    # fixed run of digits would raise StopIteration on the tenth cell. The
    # values are strings because datacnv() concatenates them directly.
    counter = (str(n) for n in itertools.count(1))

    st = re.sub("(?i)<R>(.*?)</R>", lambda m: datacnv(m.group(1)), st)

    return "<row>\n" + st + "</row>\n"

def tblcnv( st, val ):
    """Rewrite one table's markup: new header first, then each row.

    Every opening table tag in *st* is replaced by ``<table ct='VAL'>`` plus
    one ``<colspec>`` line per column (1..VAL); each ``<ROW>...</ROW>`` is then
    handed to ``rowcnv``.

    st  -- text of the table to convert.
    val -- column count as a string of digits.

    Returns the converted text.
    """
    colspecs = "".join(
        "<colspec col='" + str(n) + "' colwidth=''/>\n"
        for n in range(1, int(val) + 1)
    )
    header = "\n<table ct='" + val +"'>\n" + colspecs + "\n"

    # Header first, rows second, in the same order as before.
    with_header = re.sub(r"(?i)<table.*?>", header, st)
    return re.sub("(?i)<ROW>(.*?)</ROW>", lambda row: rowcnv(row.group(1)), with_header)

# Driver: read in.txt, convert every table, write the result to out.txt.
# The whole input is lower-cased up front, so tag names AND cell text come out
# in lower case.
with open("in.txt", "r") as in_data:
    Rdata = in_data.read().lower()
# Flatten to a single line so "." in the pattern can span what were separate
# lines ("." does not match "\n" without re.DOTALL).
Rval = Rdata.replace("\n", " ")

# Raw string so \d reaches the regex engine untouched. The non-greedy ".*?"
# keeps each match inside a single <table>...</table> pair.
Rval = re.sub(r'(?i)(<TABLE.*?cols="(\d+).*?</TABLE>)', lambda m: tblcnv(m.group(1), m.group(2)), Rval)

# Open the output only once there is something to write; "with" closes the
# handle even if the write fails.
with open("out.txt", "w") as out_data:
    out_data.write(Rval)

Output

<table ct='3'>
<colspec col='1' colwidth=''/>
<colspec col='2' colwidth=''/>
<colspec col='3' colwidth=''/>

              <row>
                 <rvn="1">data</rvn>
                 <rvn="2">data</rvn>
                  </row>
               <row>
                 <rvn="1">data</rvn>
                 <rvn="2">data</rvn>
                 <rvn="3">data</rvn>
               </row>
     </table>     
<table ct='4'>
<colspec col='1' colwidth=''/>
<colspec col='2' colwidth=''/>
<colspec col='3' colwidth=''/>
<colspec col='4' colwidth=''/>

              <row>
                 <rvn="1">data</rvn>
                 <rvn="2">data</rvn>
                 <rvn="3">data</rvn>
                 <rvn="4">data</rvn>
                 <rvn="5">data</rvn>
                  </row>
               <row>
                 <rvn="1">data</rvn>
                 <rvn="2">data</rvn>
               </row>
     </table>  

Upvotes: 1

unutbu
unutbu

Reputation: 879561

Using an HTML/XML parser is an easier and less error-prone way of manipulating HTML/XML.

It is easier because the parser lets you deal with higher-level concepts: tags and attributes instead of regex on arbitrary strings.


Here is an example using lxml:

import lxml.etree as ET
import itertools as IT

content = '''\
<root>
<TABLE COLS="3">
         <ROW>
            <R>data</R>
            <R>data</R>   
          </ROW>
          <ROW>
            <R>data</R>
            <R>data</R>
            <R>data</R>
          </ROW>
</TABLE>
<TABLE COLS="4">
         <ROW>
            <R>data</R>
            <R>data</R>
            <R>data</R>
            <R>data</R>
            <R>data</R>   
          </ROW>
          <ROW>
            <R>data</R>
            <R>data</R>
          </ROW>
</TABLE>
</root>
'''

# Parse the sample document and rewrite it in place: tags are lower-cased,
# each table's COLS attribute is renamed to ct, <ent> children are added, and
# every <R> becomes a numbered <rvn>.
root = ET.fromstring(content)
for elt in root.iter():
    # Lower-case every tag so the comparisons below can use lower-case names.
    elt.tag = elt.tag.lower()
    if elt.tag == 'table':
        # Move the column count from the COLS attribute to ct.
        elt.attrib['ct'] = elt.attrib['COLS']
        del elt.attrib['COLS']
        # Add <ent> tags: counting down while always inserting at index 0
        # leaves them in ascending order at the front of the table.
        for i in range(int(elt.attrib['ct']), 0, -1):
            elt.insert(0, ET.Element('ent', value=str(i)))
    # Restart count every time <row> is encountered
    if elt.tag == 'row':
        count = IT.count(1)
    # Change <R> to <rvn> and number it from the current counter.
    # NOTE(review): count is first bound when a <row> is visited, so an <R>
    # visited before any <row> would raise NameError here.
    if elt.tag == 'r':
        elt.tag = 'rvn'
        elt.attrib['value'] = str(next(count))
# NOTE(review): on Python 3, tostring() presumably returns bytes by default, so
# this would print a b'...' repr rather than the text shown below — confirm.
print(ET.tostring(root, pretty_print = True))

yields

<root>
<table ct="3">
         <ent value="1"/><ent value="2"/><ent value="3"/><row>
            <rvn value="1">data</rvn>
            <rvn value="2">data</rvn>   
          </row>
          <row>
            <rvn value="1">data</rvn>
            <rvn value="2">data</rvn>
            <rvn value="3">data</rvn>
          </row>
</table>
<table ct="4">
         <ent value="1"/><ent value="2"/><ent value="3"/><ent value="4"/><row>
            <rvn value="1">data</rvn>
            <rvn value="2">data</rvn>
            <rvn value="3">data</rvn>
            <rvn value="4">data</rvn>
            <rvn value="5">data</rvn>   
          </row>
          <row>
            <rvn value="1">data</rvn>
            <rvn value="2">data</rvn>
          </row>
</table>
</root>

Upvotes: 1

Related Questions