Set a multichoice regex to make its matching attempts always from left to right, no matter if another previous regex tries to capture more chars?

Question

import re

input_text = 'el dia corrimos juntas hasta el 11° nivel de aquella montaña hasta el 2022_-_12_-_13' 
#input_text = 'desde el  corrimos juntas hasta el 11° nivel de aquella montaña y luego bajamos hasta la salida, hasta el 2022_-_12_-_01 21:00 hs caminamos juntas' #example 2


date_format = r"(?:$|)\s*(\d*)_-_(\d{2})_-_(\d{2})\s*(?:$|)"

#text in the middle associated with the date range...
#some_text = r"(?:(?!\.\s*?\n)[^;])*" #but cannot contain ";", ".\s*\n"
some_text = r"(?:(?!\.\s*)[^;])*" #but cannot contain ";", ".\s*"
#some_text = r"(?:[^.;])*" #but cannot contain ";", "."

identification_re_0 = r"(?:el dia|dia|el)\s*(?:del|de\s*el|de |)\s*(" + some_text + r")\s*(?:,\s*hasta|hasta|al|a )\s*(?:el|la|)\s*" + date_format

input_text = re.sub(identification_re_0,
                    lambda m: print(m[1]),
                    input_text, re.IGNORECASE)

#print(repr(input_text)) # --> output

These are the incorrect outputs that I got:

'corrimos juntas hasta el 11° nivel de aquella montaña hast'
'corrimos juntas hasta el 11° nivel de aquella montaña y luego bajamos hasta la salida, hast'

And these would be the correct outputs that you should get with this examples:

'corrimos juntas hasta el 11° nivel de aquella montaña'
'corrimos juntas hasta el 11° nivel de aquella montaña y luego bajamos hasta la salida'

Why does the (?:,\s*hasta|hasta|al|a ) capture group try its options backwards? Why is it trying to conform to the greedy behavior of the above regex, in this case (?:(?!\.\s*)[^;])*?

Edit with a possible solution:

I have achieved more or less close results except with example 3 where I could not make it so that if there was not something captured by some_text the () are not placed

import re

input_text = 'desde el 2022_-_12_-_10 corrimos juntas hasta el 11° nivel de aquella montaña hasta el 2022_-_12_-_13' #example 1
#input_text = 'desde el 2022_-_11_-_10 18:30 pm corrimos juntas hasta el 11° nivel de aquella montaña y luego bajamos hasta la salida, hasta el 2022_-_12_-_01 21:00 hs caminamos juntas' #example 2
#input_text = 'desde el 2022_-_11_-_10 18:30 pm hasta el 2022_-_12_-_01 21:00 hs' #example 3

#text in the middle associated with the date range...
#some_text = r"(?:(?!\.\s*?\n)[^;])*" #but cannot contain ";", ".\s*\n"
some_text = r"(?:(?!\.\s*)[^;])*" #but cannot contain ";", ".\s*"
#some_text = r"(?:[^.;])*" #but cannot contain ";", "."

identificate_hours = r"(?:a\s*las|a\s*la|)\s*(?:$|)\s*(\d{1,2}):(\d{1,2})\s*(?:(am)|(pm))\s*(?:$|)" #acepta que no se le indicase el 'am' o el 'pm'
identificate_hours = r"(?:a\s*las|a\s*la|)\s*(?:$|)\s*(\d{1,2}):(\d{1,2})\s*(?:(am)|(pm)|)\s*(?:$|)" #no acepta que no se le indicase el 'am' o el 'pm'

date_format = r"(?:$|)\s*(\d*)_-_(\d{2})_-_(\d{2})\s*(?:$|)"

# (?:,\s*hasta|hasta|al|a )
some_text_limiters = [r",\s*hasta", r"hasta", r"al", r"a "]

for some_text_limiter in some_text_limiters:

    identification_re_0 = r"(?:(?<=\s)|^)(?:desde\s*el|desde|del|de\s*el|de\s*la|de |)\s*(?:día|dia|fecha|)\s*(?:del|de\s*el|de |)\s*" + date_format + r"\s*(?:" + identificate_hours + r"|)\s*(?:\)|)\s*(" + some_text + r")\s*" + some_text_limiter + r"\s*(?:el|la|)\s*(?:fecha|d[íi]a|)\s*(?:del|de\s*el|de|)\s*" + date_format + r"\s*(?:" + identificate_hours + r"|)\s*(?:\)|)"

    input_text = re.sub(identification_re_0,
                        lambda m: (f"({m[1]}_-_{m[2]}_-_({m[3]}({m[4] or '00'}:{m[5] or '00'} {m[6] or m[7] or 'am'})_--_{m[9]}_-_{m[10]}_-_({m[11]}({m[12] or '00'}:{m[13] or '00'} {m[14] or m[15] or 'am'})))({m[8]})").replace(" )", ")").replace("( ", "("),
                        input_text, re.IGNORECASE)


print(repr(input_text))

Ramesh · Accepted Answer

you can validate the date strings and then replace the date strings with symbols(make sure it won't repeat in the text) and extract the text between them.

import re

re_exp = r'((?:hasta el))?\s\d{4}\_\-\_\d{2}\_\-\_\d{2}\s?((?:\d{2}\:\d{2}\s(?:am|pm)?)?)'
input_text = 'desde el 2022_-_12_-_10 corrimos juntas hasta el 11° nivel de aquella montaña hasta el 2022_-_12_-_13'
input_text = 'desde el 2022_-_11_-_10 18:30 pm corrimos juntas hasta el 11° nivel de aquella montaña y ' \
             'luego bajamos hasta la salida, hasta el 2022_-_12_-_01 21:00 hs caminamos juntas'
input_text = "desde el 2022_-_11_-_10 18:30 pm hasta el 2022_-_12_-_01 21:00 hs"
data = re.sub(re_exp, "@*@", input_text)
text_btw_dates = [i.replace('@', '').strip().strip(".,") for i in data.split('*') if
                  i.startswith('@') and i.endswith('@') and len(i) > 1]
print(text_btw_dates)

>>> ['corrimos juntas hasta el 11° nivel de aquella montaña']
>>> ['corrimos juntas hasta el 11° nivel de aquella montaña y luego bajamos hasta la salida']
>>> [""]

Set a multichoice regex to make its matching attempts always from left to right, no matter if another previous regex tries to capture more chars?

Answers (1)

Related Questions