Reputation: 13
I started learning web scraping with Python. Currently, I would like to download a video of the Japanese Diet. (https://www.shugiintv.go.jp/jp/index.php?ex=VL&deli_id=40124&media_type=)
The video seems to have a mechanism to call chunklist.m3u8 from playlist.m3u8 and then call the ts files described in chunklist.m3u8 in order.
I want to download the contents from the playlist.m3u8 URL first, then call chunklist.m3u8 to download the ts files in order and concat.
However, when I tried to download playlist.m3u8, the response was not the text I expected.
Also, the sample URL of playlist.m3u8 is here↓
http://hlsvod.shugiintv.go.jp/vod/_definst_/amlst:2011/2011-1207-0900-12/playlist.m3u8
code:
# Minimal reproduction: request the playlist URL and print the response body.
import requests
url = "http://hlsvod.shugiintv.go.jp/vod/_definst_/amlst:2011/2011-1207-0900-12/playlist.m3u8"
# Plain GET with the default requests headers; no status check is performed.
res = requests.get(url)
# Print the decoded body so it can be compared with the expected playlist text.
print(res.text)
expected text:
#EXTM3U
#EXT-X-VERSION:3
#EXT-X-STREAM-INF:BANDWIDTH=564000,NAME="500k",RESOLUTION=640x360
chunklist_w60346572_b564000_t64NTAwaw==.m3u8
actual text:
<html><head><title>Wowza Streaming Engine 4 Perpetual Bundle Unlimited Edition 4.7.7 build20181108145350</title></head><body>Wowza Streaming Engine 4 Perpetual Bundle Unlimited Edition 4.7.7 build20181108145350</body></html>
I think there is a problem with the colon in the URL, but I don't have a clear solution. I would like to know how to avoid URL issues and successfully download the text in playlist.m3u8. Thanks.
Version:
Python 3.7.9
requests 2.25.1
Upvotes: 1
Views: 9938
Reputation: 31
You can try this to download the m3u8, any encryption keys, and the ts file parts.
import os
import re
import sys
from urllib.parse import urljoin

import requests
# Folder, relative to the current working directory, that downloads are
# written into.
OUTPUT_FOLDER = 'output'

# Browser-like headers sent with every request made by this script.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0)"
        " Gecko/20100101 Firefox/57.0"
    ),
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.5",
    # "br" is deliberately not advertised: requests can only decode Brotli
    # when an optional brotli package is installed. Without it, a server that
    # honours "br" would hand back a compressed playlist that is parsed as
    # garbage and yields no URLs.
    "Accept-Encoding": "gzip, deflate",
    "DNT": "1",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
}
def _localFileName(ref):
    """Return the bare file name of a playlist reference.

    The query string and any directory or scheme/host prefix are dropped, so
    relative names, nested paths and absolute URLs all map to a plain name
    that can be created inside the target folder.
    """
    return ref.split('?')[0].split('/')[-1]


def _cleanRefs(refs):
    """Strip whitespace from references and drop blank lines and '#' tag lines."""
    cleaned = (ref.strip() for ref in refs)
    return [ref for ref in cleaned if ref and not ref.startswith('#')]


def downloadM3u8(m3u8url, headers=HEADERS, depth=0):
    """Recursively download an m3u8 playlist and everything it references.

    The playlist, its key files and its .ts segments are saved under
    ``OUTPUT_FOLDER/[<grandparent dir>/]<parent dir>/`` (the grandparent
    directory is only used for nested playlists, i.e. ``depth > 0``).
    Segments whose file name already exists in the target folder are skipped,
    so an interrupted run can be resumed.

    Args:
        m3u8url: URL of the playlist to fetch.
        headers: HTTP headers sent with every request made by this call and
            by the recursive calls it triggers.
        depth: Recursion depth; 0 for the playlist given by the caller.

    Returns:
        A list with the absolute URL of every child playlist that was
        processed, in depth-first order.

    Raises:
        requests.HTTPError: If any request returns an error status. Failing
            early keeps error pages from being saved as playlists or
            segments and then skipped as "already downloaded" on a re-run.
    """
    # Stray whitespace around the URL would be sent percent-encoded
    # (e.g. a trailing "%20") and change the requested path.
    m3u8url = m3u8url.strip()

    print('processing: {}'.format(m3u8url))
    m3u8 = requests.get(m3u8url, headers=headers)  # get the m3u8 file
    m3u8.raise_for_status()

    # Split only the path part so a '/' inside the query string cannot shift
    # the folder / file name positions.
    url_parts = m3u8url.split('?')[0].split('/')
    filename = url_parts[-1]
    folder = url_parts[-2]
    parent_folder = url_parts[-3] if depth > 0 else None

    path_parts = [
        part for part in (OUTPUT_FOLDER, parent_folder, folder)
        if part is not None
    ]
    target_dir = os.path.join(*path_parts)
    # makedirs creates missing intermediate folders; nested playlists need
    # OUTPUT_FOLDER/<parent>/<folder>, which a single mkdir cannot create.
    os.makedirs(target_dir, exist_ok=True)

    target_path = os.path.join(target_dir, filename)
    with open(target_path, 'wb') as f:
        print('writing file to {}'.format(target_path))
        f.write(m3u8.content)

    # Download encryption key files.
    key_urls = _cleanRefs(extractKeyUrls(m3u8))
    print('key_urls', key_urls)
    for key_url in key_urls:
        # urljoin resolves relative, root-relative and absolute references
        # against the playlist URL.
        key_file = requests.get(urljoin(m3u8url, key_url), headers=headers)
        key_file.raise_for_status()
        with open(os.path.join(target_dir, _localFileName(key_url)), 'wb') as f:
            f.write(key_file.content)

    ts_urls = _cleanRefs(extractTsUrls(m3u8))  # get all the .ts urls
    print('ts_urls', ts_urls)

    # Skip segments that are already present in the target folder.
    ts_files = {name for name in os.listdir(target_dir) if '.ts' in name}
    print('all ts files existing: {}'.format(ts_files))
    ts_urls = [ts for ts in ts_urls if _localFileName(ts) not in ts_files]

    for ts in ts_urls:
        ts_url = urljoin(m3u8url, ts)
        print('downloading: {}'.format(ts_url))
        ts_file = requests.get(ts_url, headers=headers)
        ts_file.raise_for_status()
        with open(os.path.join(target_dir, _localFileName(ts)), 'wb') as f:
            f.write(ts_file.content)

    child_urls = _cleanRefs(extractM3u8Urls(m3u8))  # child playlists
    all_urls = []
    print('child_urls', child_urls)
    for child in child_urls:
        new_url = urljoin(m3u8url, child)
        all_urls.append(new_url)
        subchildren = downloadM3u8(new_url, headers=headers, depth=depth + 1)
        print('subchildren', subchildren)
        all_urls.extend(subchildren)
    return all_urls
def extractTsUrls(m3):
    """Return the .ts segment references listed in an m3u8 playlist.

    Args:
        m3: Object exposing the playlist body as a ``text`` attribute
            (for example a ``requests`` response).

    Returns:
        A list of the playlist lines that contain ``.ts``, in file order,
        with surrounding whitespace removed. Blank lines and lines starting
        with ``#`` (tags and comments) are never returned.
    """
    urls = []
    for raw_line in m3.text.splitlines():
        # Stripping removes the '\r' left by CRLF playlists, which would
        # otherwise end up inside the request URL and the saved file name.
        line = raw_line.strip()
        if not line or line.startswith('#'):
            continue
        if '.ts' in line:
            urls.append(line)
    return urls
def extractM3u8Urls(m3):
    """Return the child playlist references listed in an m3u8 playlist.

    Args:
        m3: Object exposing the playlist body as a ``text`` attribute
            (for example a ``requests`` response).

    Returns:
        A list of the playlist lines that contain ``.m3u8``, in file order,
        with surrounding whitespace removed. Blank lines and lines starting
        with ``#`` are never returned, so a playlist named only inside a tag
        attribute (``URI="..."``) is not included.
    """
    urls = []
    for raw_line in m3.text.splitlines():
        # Stripping removes the '\r' left by CRLF playlists, which would
        # otherwise become part of the child playlist URL.
        line = raw_line.strip()
        if not line or line.startswith('#'):
            continue
        if '.m3u8' in line:
            urls.append(line)
    return urls
def extractKeyUrls(m3):
    """Return the encryption key URIs declared in an m3u8 playlist.

    Only ``#EXT-X-KEY`` and ``#EXT-X-SESSION-KEY`` tags are inspected, so the
    ``URI="..."`` attribute of other tags is not mistaken for a key.

    Args:
        m3: Object exposing the playlist body as a ``text`` attribute
            (for example a ``requests`` response).

    Returns:
        A list of the distinct ``URI`` attribute values of the key tags, in
        order of first appearance. Key tags without a ``URI`` attribute
        contribute nothing.
    """
    key_tags = ('#EXT-X-KEY', '#EXT-X-SESSION-KEY')
    urls = []
    for raw_line in m3.text.splitlines():
        line = raw_line.strip()
        if not line.startswith(key_tags):
            continue
        match = re.search(r'URI="([^"]+)"', line)
        # A key repeated across several tags is only reported once.
        if match and match.group(1) not in urls:
            urls.append(match.group(1))
    return urls
if __name__ == "__main__":
    # sys.argv[0] is the path of this script; the playlist URL is the first
    # real command-line argument.
    if len(sys.argv) < 2:
        sys.exit('usage: {} <m3u8-url>'.format(sys.argv[0]))
    # Strip stray whitespace so it is not sent as part of the URL.
    downloadM3u8(sys.argv[1].strip(), headers=HEADERS)
    print('done')
Upvotes: 0
Reputation: 989
Something is wrong with your url:
>>> url = "http://hlsvod.shugiintv.go.jp/vod/_definst_/amlst:2011/2011-1207-0900-12/playlist.m3u8"
>>> res = requests.get(url)
>>> res.request.url
'https://hlsvod.shugiintv.go.jp/vod/_definst_/amlst:2011/2011-1207-0900-12/playlist.m3u8%20'
See the "%20" at the end?
I am not really sure how it got there, but copy-pasting this should work:
url = 'https://hlsvod.shugiintv.go.jp/vod/_definst_/amlst:2011/2011-1207-0900-12/playlist.m3u8'
Upvotes: 1