How to scrape data from a website with a "view more" button using BeautifulSoup

I am trying to scrape news from Reuters, but there is a "click to view more" button at the bottom of the page. I don't know how to load the hidden results using Beautiful Soup.

from bs4 import BeautifulSoup
import urllib.request

def scrape_reuters_news(ticker):
    """Scrape Reuters search results for *ticker* and download each article.

    Requests the past-week, relevance-sorted search page once, follows the
    link found inside every ``<h3>`` heading of that page, and collects the
    headline and paragraph text of each linked article. Only the results
    present in that single search response are visited.

    Args:
        ticker: Search term. It is URL-encoded before being placed in the
            query string, so terms with spaces or ``&`` are safe.

    Returns:
        A ``(titles, articles)`` tuple of two equal-length lists. Each title
        is the stripped text of the article's first ``<h1>`` (or the search
        result's link text when the page has no ``<h1>``); each article is
        the text of all its ``<p>`` elements, every one followed by a space.

    Raises:
        urllib.error.URLError: If the search page or an article page cannot
            be fetched.
    """
    # Imported locally so the function does not depend on ``urllib.parse``
    # having been loaded as a side effect of ``import urllib.request``.
    from urllib.parse import quote_plus, urljoin

    base_url = "https://www.reuters.com/"
    search_url = (
        base_url
        + "search/news?sortBy=relevance&dateRange=pastWeek&blob="
        + quote_plus(str(ticker))
    )
    # ``with`` closes the HTTP response even if parsing fails.
    with urllib.request.urlopen(search_url) as response:
        parsed_results = BeautifulSoup(response.read(), 'lxml')

    titles = []
    articles = []

    for heading in parsed_results.find_all("h3"):
        # Read the href attribute instead of slicing the tag's HTML at fixed
        # character offsets, which breaks as soon as the markup changes.
        anchor = heading.find("a", href=True)
        if anchor is None:
            continue  # heading that is not a search result link

        # urljoin handles both relative ("/article/...") and absolute hrefs
        # without producing a doubled slash.
        article_url = urljoin(base_url, anchor["href"])
        with urllib.request.urlopen(article_url) as response:
            parsed_article = BeautifulSoup(response.read(), 'lxml')

        # The generated CSS-module class name was never applied as a filter
        # (``select`` treats its second argument as namespaces), so the first
        # <h1> is what was effectively used; fall back to the link text
        # rather than raising IndexError when no <h1> exists.
        headline = parsed_article.find("h1")
        title_source = headline if headline is not None else anchor
        titles.append(title_source.get_text().strip())

        # Keep the one-space-after-each-paragraph output format.
        articles.append(
            "".join(paragraph.text + " " for paragraph in parsed_article.find_all("p"))
        )

    return titles, articles

# Example run: change the search term here.
ticker = "apple"
news = scrape_reuters_news(ticker=ticker)

Answer

When you click "load more", a callback request is issued, which you can find in the browser's network tab. If you grab the number of results from the search page, you can pass it to that callback to get all results in one go. I then use regular expressions to extract each id (to reconstruct the detail page URL) and each title (headline).

You would then visit each link to get the paragraph info.

Please note:

  1. There is some de-duplication work to do. Different ids can lead to the same content, so perhaps exclude duplicates based on title?
  2. You may need to consider whether any pre-processing of the ticker needs to happen, e.g. converting to lowercase or replacing spaces with "-". I don't know all your use cases.

from bs4 import BeautifulSoup as bs
import requests, re

# Search term to look up; it is URL-encoded by requests via ``params``.
ticker = 'apple'

with requests.Session() as s:
    # Step 1: load the regular search page only to read the total hit count.
    r = s.get(
        'https://www.reuters.com/search/news',
        params={'sortBy': 'relevance', 'dateRange': 'pastWeek', 'blob': ticker},
    )
    r.raise_for_status()
    soup = bs(r.content, 'lxml')
    count_node = soup.select_one('.search-result-count-num')
    if count_node is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f'no result count found on the search page for {ticker!r}')
    num_results = count_node.text.strip()

    # Step 2: call the "load more" endpoint, asking for every result at once.
    r = s.get(
        'https://www.reuters.com/assets/searchArticleLoadMoreJson',
        params={
            'blob': ticker,
            'bigOrSmall': 'big',
            'articleWithBlog': 'true',
            'sortBy': 'relevance',
            'dateRange': 'pastWeek',
            'numResultsToShow': num_results,
            'pn': '',
            'callback': 'addMoreNewsResults',
        },
    )
    r.raise_for_status()

    # The response is a JavaScript callback, not strict JSON, so the fields
    # are pulled out with regular expressions.
    p = re.compile(r'id: "(.*?)"')
    p2 = re.compile(r'headline: "(.*?)"')
    links = [f'https://www.reuters.com/article/id{article_id}' for article_id in p.findall(r.text)]
    # Headlines contain HTML markup; parse each one to keep only its text.
    headlines = [bs(raw_headline, 'lxml').get_text() for raw_headline in p2.findall(r.text)]

print(len(links), len(headlines))

From each detail page (with `soup` being that page's parsed HTML, not the search page), you can get the paragraphs with:

# Collect every element whose data-testid contains "paragraph-" and join
# their text with single spaces.
paragraph_nodes = soup.select('[data-testid*=paragraph-]')
paras = ' '.join(node.get_text() for node in paragraph_nodes)