Umair Ayub
Umair Ayub

Reputation: 21361

Extract strings between two strings using Regex

I am scraping a site and I want to extract the JSON assigned to the data variable in the following JS code, using a Python regex.

<script type="text/javascript">
P.when('A').register("ImageBlockATF", function(A){
    var data = {
                'colorImages': { 'initial': [{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/81Oo79kGp2L._SL1500_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/41SnVVzKChL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/41SnVVzKChL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/81Oo79kGp2L._SY355_.jpg":[355,270],"https://images-na.ssl-images-amazon.com/images/I/81Oo79kGp2L._SY450_.jpg":[450,342],"https://images-na.ssl-images-amazon.com/images/I/81Oo79kGp2L._SY550_.jpg":[550,419],"https://images-na.ssl-images-amazon.com/images/I/81Oo79kGp2L._SY606_.jpg":[606,461],"https://images-na.ssl-images-amazon.com/images/I/81Oo79kGp2L._SY679_.jpg":[679,517]},"variant":"MAIN","lowRes":null},{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/81%2BGc-r4gLL._SL1500_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/416rXB0xcmL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/416rXB0xcmL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/81%2BGc-r4gLL._SY355_.jpg":[355,276],"https://images-na.ssl-images-amazon.com/images/I/81%2BGc-r4gLL._SY450_.jpg":[450,349],"https://images-na.ssl-images-amazon.com/images/I/81%2BGc-r4gLL._SX425_.jpg":[547,425],"https://images-na.ssl-images-amazon.com/images/I/81%2BGc-r4gLL._SX466_.jpg":[600,466],"https://images-na.ssl-images-amazon.com/images/I/81%2BGc-r4gLL._SX522_.jpg":[672,522]},"variant":"PT01","lowRes":null},{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/817slrgsGbL._SL1500_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/51gQxeLTYhL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/51gQxeLTYhL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/817slrgsGbL._SX355_.jpg":[251,355],"https://images-na.ssl-images-amazon.com/images/I/817slrgsGbL._SX450_.jpg":[318,450],"https://images-na.ssl-images-amazon.com/images/I/817slrgsGbL._SX425_.jpg":[300,425],"https://images-na.ssl
-images-amazon.com/images/I/817slrgsGbL._SX466_.jpg":[329,466],"https://images-na.ssl-images-amazon.com/images/I/817slrgsGbL._SX522_.jpg":[369,522]},"variant":"PT02","lowRes":null},{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/71r3nXKZBmL._SL1500_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/41d9m8J4MbL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/41d9m8J4MbL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/71r3nXKZBmL._SX355_.jpg":[142,355],"https://images-na.ssl-images-amazon.com/images/I/71r3nXKZBmL._SX450_.jpg":[180,450],"https://images-na.ssl-images-amazon.com/images/I/71r3nXKZBmL._SX425_.jpg":[170,425],"https://images-na.ssl-images-amazon.com/images/I/71r3nXKZBmL._SX466_.jpg":[187,466],"https://images-na.ssl-images-amazon.com/images/I/71r3nXKZBmL._SX522_.jpg":[209,522]},"variant":"PT03","lowRes":null},{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/81Uys4ccU4L._SL1500_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/41zh%2BCGamHL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/41zh%2BCGamHL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/81Uys4ccU4L._SY355_.jpg":[355,260],"https://images-na.ssl-images-amazon.com/images/I/81Uys4ccU4L._SY450_.jpg":[450,330],"https://images-na.ssl-images-amazon.com/images/I/81Uys4ccU4L._SY550_.jpg":[550,403],"https://images-na.ssl-images-amazon.com/images/I/81Uys4ccU4L._SY606_.jpg":[606,444],"https://images-na.ssl-images-amazon.com/images/I/81Uys4ccU4L._SY679_.jpg":[679,498]},"variant":"PT04","lowRes":null},{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/8179KoLoyGL._SL1500_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/41sMHp-WegL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/41sMHp-WegL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/8179KoLoyGL._SY355_.jpg":[355,258],"https://images-na.ssl-images-amazon.com/images/I/8179KoLoyGL._S
Y450_.jpg":[450,327],"https://images-na.ssl-images-amazon.com/images/I/8179KoLoyGL._SY550_.jpg":[550,400],"https://images-na.ssl-images-amazon.com/images/I/8179KoLoyGL._SY606_.jpg":[606,441],"https://images-na.ssl-images-amazon.com/images/I/8179KoLoyGL._SY679_.jpg":[679,494]},"variant":"PT05","lowRes":null},{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/71Sw2wrvy6L._SL1364_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/416TFrjOFlL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/416TFrjOFlL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/71Sw2wrvy6L._SX355_.jpg":[231,355],"https://images-na.ssl-images-amazon.com/images/I/71Sw2wrvy6L._SX450_.jpg":[293,450],"https://images-na.ssl-images-amazon.com/images/I/71Sw2wrvy6L._SX425_.jpg":[277,425],"https://images-na.ssl-images-amazon.com/images/I/71Sw2wrvy6L._SX466_.jpg":[304,466],"https://images-na.ssl-images-amazon.com/images/I/71Sw2wrvy6L._SX522_.jpg":[340,522]},"variant":"PT06","lowRes":null},{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/61GKBhtPKPL._SL1341_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/41%2BNMI0l9yL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/41%2BNMI0l9yL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/61GKBhtPKPL._SX355_.jpg":[190,355],"https://images-na.ssl-images-amazon.com/images/I/61GKBhtPKPL._SX450_.jpg":[240,450],"https://images-na.ssl-images-amazon.com/images/I/61GKBhtPKPL._SX425_.jpg":[227,425],"https://images-na.ssl-images-amazon.com/images/I/61GKBhtPKPL._SX466_.jpg":[249,466],"https://images-na.ssl-images-amazon.com/images/I/61GKBhtPKPL._SX522_.jpg":[279,522]},"variant":"PT07","lowRes":null},{"hiRes":null,"thumb":"https://images-na.ssl-images-amazon.com/images/I/41ziorm06nL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/41ziorm06nL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/41ziorm06nL._SY355_.jpg":[355,266],"https:
//images-na.ssl-images-amazon.com/images/I/41ziorm06nL._SY450_.jpg":[450,338],"https://images-na.ssl-images-amazon.com/images/I/41ziorm06nL.jpg":[500,375]},"variant":"AW01","lowRes":null},{"hiRes":null,"thumb":"https://images-na.ssl-images-amazon.com/images/I/41lZ6jtPe%2BL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/41lZ6jtPe%2BL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/41lZ6jtPe%2BL._SY355_.jpg":[355,266],"https://images-na.ssl-images-amazon.com/images/I/41lZ6jtPe%2BL._SY450_.jpg":[450,338],"https://images-na.ssl-images-amazon.com/images/I/41lZ6jtPe%2BL.jpg":[500,375]},"variant":"AW02","lowRes":null},{"hiRes":null,"thumb":"https://images-na.ssl-images-amazon.com/images/I/51JqQcNGjUL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/51JqQcNGjUL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/51JqQcNGjUL._SY355_.jpg":[355,355],"https://images-na.ssl-images-amazon.com/images/I/51JqQcNGjUL._SY450_.jpg":[450,450],"https://images-na.ssl-images-amazon.com/images/I/51JqQcNGjUL._SX425_.jpg":[425,425],"https://images-na.ssl-images-amazon.com/images/I/51JqQcNGjUL._SX466_.jpg":[466,466],"https://images-na.ssl-images-amazon.com/images/I/51JqQcNGjUL.jpg":[500,500]},"variant":"AW03","lowRes":null}]},
                'colorToAsin': {'initial': {}},
                'holderRatio': 1.0,
                'holderMaxHeight': 700,
                'heroImage': {'initial': []},
                'weblabs' : {}
                };
    A.trigger('P.AboveTheFold'); // trigger ATF event.
    return data;
});
</script>

I have been trying the following regex, but it is not working.

(var\s+data\s+=).*^[A.trigger('P.AboveTheFold')]$

Basically, I need a regex to grab the string between var data = and A.trigger('P.AboveTheFold').

Upvotes: 1

Views: 121

Answers (1)

Eric Duminil
Eric Duminil

Reputation: 54303

If you're sure your JSON data doesn't contain any semicolon (;), you can write:

var data\s*=\s*([^;]*});

This is not very robust, and you should probably use a parsing library instead. The JSON data is captured in the first group.

See it here.

If you're sure your data is always between var data = and A.trigger('P.AboveTheFold'), you can use:

(?<=var data = ).*(?=A.trigger\('P\.AboveTheFold'\))

See it there.

The JSON data is the complete match, thanks to the positive lookarounds. This pattern is not robust either: for example, any different spacing between data and = would break it. You'll also need the re.DOTALL flag to tell Python that . should match newlines too.

Upvotes: 1

Related Questions