user3511310
user3511310

Reputation: 31

Excel VBA Macro: Scraping data from site table that spans multiple pages

Thanks in advance for the help. I'm running Windows 8.1, I have the latest IE / Chrome browsers, and the latest Excel. I'm trying to write an Excel Macro that pulls data from StackOverflow (https://stackoverflow.com/tags). Specifically, I'm trying to pull the date (that the macro is run), the tag names, the # of tags, and the brief description of what the tag is. I have it working for the first page of the table, but not for the rest (there are 1132 pages at the moment). Right now, it overwrites the data everytime I run the macro, and I'm not sure how to make it look for the next empty cell before running.. Lastly, I'm trying to make it run automatically once per week.

I'd much appreciate any help here. Problems are:

  1. Pulling data from the web table beyond the first page
  2. Making it scrape data to the next empty row rather than overwriting
  3. Making the Macro run automatically once per week

Code (so far) is below. Thanks!

Enum READYSTATE
READYSTATE_UNINITIALIZED = 0
READYSTATE_LOADING = 1
READYSTATE_LOADED = 2
READYSTATE_INTERACTIVE = 3
READYSTATE_COMPLETE = 4
End Enum

Sub ImportStackOverflowData()
  'to refer to the running copy of Internet Explorer
  Dim ie As InternetExplorer
  'to refer to the HTML document returned
  Dim html As HTMLDocument
  'open Internet Explorer in memory, and go to website
  Set ie = New InternetExplorer
  ie.Visible = False
  ie.navigate "http://stackoverflow.com/tags"

  'Wait until IE is done loading page
  Do While ie.READYSTATE <> READYSTATE_COMPLETE
    Application.StatusBar = "Trying to go to StackOverflow ..."
    DoEvents
  Loop

  'show text of HTML document returned
  Set html = ie.document

  'close down IE and reset status bar
  Set ie = Nothing
  Application.StatusBar = ""

  'clear old data out and put titles in
  'Cells.Clear
  'put heading across the top of row 3
  Range("A3").Value = "Date Pulled"
  Range("B3").Value = "Keyword"
  Range("C3").Value = "# Of Tags"
  'Range("C3").Value = "Asked This Week"
  Range("D3").Value = "Description"

  Dim TagList As IHTMLElement
  Dim Tags As IHTMLElementCollection
  Dim Tag As IHTMLElement
  Dim RowNumber As Long
  Dim TagFields As IHTMLElementCollection
  Dim TagField As IHTMLElement
  Dim Keyword As String
  Dim NumberOfTags As String
  'Dim AskedThisWeek As String
  Dim TagDescription As String
  'Dim QuestionFieldLinks As IHTMLElementCollection
  Dim TodaysDate As Date

  Set TagList = html.getElementById("tags-browser")
  Set Tags = html.getElementsByClassName("tag-cell")
  RowNumber = 4

  For Each Tag In Tags
    'if this is the tag containing the details, process it
    If Tag.className = "tag-cell" Then
      'get a list of all of the parts of this question,
      'and loop over them
      Set TagFields = Tag.all

      For Each TagField In TagFields
        'if this is the keyword, store it
        If TagField.className = "post-tag" Then
          'store the text value
          Keyword = TagField.innerText
          Cells(RowNumber, 2).Value = TagField.innerText
        End If

        If TagField.className = "item-multiplier-count" Then
          'store the integer for number of tags
          NumberOfTags = TagField.innerText
          'NumberOfTags = Replace(NumberOfTags, "x", "")
          Cells(RowNumber, 3).Value = Trim(NumberOfTags)
        End If

        If TagField.className = "excerpt" Then
          Description = TagField.innerText
          Cells(RowNumber, 4).Value = TagField.innerText
        End If

        TodaysDate = Format(Now, "MM/dd/yy")
        Cells(RowNumber, 1).Value = TodaysDate

      Next TagField

      'go on to next row of worksheet
      RowNumber = RowNumber + 1
    End If
  Next

  Set html = Nothing

  'do some final formatting
  Range("A3").CurrentRegion.WrapText = False
  Range("A3").CurrentRegion.EntireColumn.AutoFit
  Range("A1:C1").EntireColumn.HorizontalAlignment = xlCenter
  Range("A1:D1").Merge
  Range("A1").Value = "StackOverflow Tag Trends"
  Range("A1").Font.Bold = True
  Application.StatusBar = ""
  MsgBox "Done!"
End Sub

Upvotes: 3

Views: 5211

Answers (3)

QHarr
QHarr

Reputation: 84465

You can improve this to parse out exact elements but it loops all the pages and grabs all the tag info (everything next to a tag)

Option Explicit

Public Sub ImportStackOverflowData()

    Dim ie As New InternetExplorer, html As HTMLDocument

    Application.ScreenUpdating = False
    With ie
        .Visible = True

        .navigate "https://stackoverflow.com/tags"

        While .Busy Or .READYSTATE < 4: DoEvents: Wend

        Set html = .document
        Dim numPages As Long, i As Long, info As Object, item As Object, counter As Long
        numPages = html.querySelector(".page-numbers.dots ~ a").innerText

        For i = 1 To 2  ' numPages ''<==1 to 2 for testing; use to numPages
            DoEvents
            Set info = html.getElementById("tags_list")
            For Each item In info.getElementsByClassName("grid-layout--cell tag-cell")
                counter = counter + 1
                Cells(counter, 1) = item.innerText
            Next item
            html.querySelector(".page-numbers.next").Click
            While .Busy Or .READYSTATE < 4: DoEvents: Wend
            Set html = .document
        Next i
        Application.ScreenUpdating = True
        .Quit '<== Remember to quit application
    End With
End Sub

Upvotes: 1

TCN
TCN

Reputation: 1651

I'm not making use of the DOM, but I find it very easy to get around just searching between known tags. If ever the expressions you are looking for are too common just tweak the code a bit so that it looks for a string after a string).

An example:

Public Sub ZipLookUp()
Dim URL As String, xmlHTTP As Object, html As Object, htmlResponse As String
Dim SStr As String, EStr As String, EndS As Integer, StartS As Integer
Dim Zip4Digit As String

    URL = "https://tools.usps.com/go/ZipLookupResultsAction!input.action?resultMode=1&companyName=&address1=1642+Harmon+Street&address2=&city=Berkeley&state=CA&urbanCode=&postalCode=&zip=94703"
    Set xmlHTTP = CreateObject("MSXML2.XMLHTTP")
    xmlHTTP.Open "GET", URL, False
    On Error GoTo NoConnect
    xmlHTTP.send
    On Error GoTo 0
    Set html = CreateObject("htmlfile")
    htmlResponse = xmlHTTP.ResponseText
    If htmlResponse = Null Then
        MsgBox ("Aborted Run - HTML response was null")
        Application.ScreenUpdating = True
        GoTo End_Prog
    End If

    'Searching for a string within 2 strings
    SStr = "<span class=""address1 range"">" ' first string
    EStr = "</span><br />"                   ' second string
    StartS = InStr(1, htmlResponse, SStr, vbTextCompare) + Len(SStr)
    EndS = InStr(StartS, htmlResponse, EStr, vbTextCompare)
    Zip4Digit = Left(Mid(htmlResponse, StartS, EndS - StartS), 4)

    MsgBox Zip4Digit

GoTo End_Prog
NoConnect:
    If Err = -2147467259 Or Err = -2146697211 Then MsgBox "Error - No Connection": GoTo End_Prog 'MsgBox Err & ": " & Error(Err)
End_Prog:
End Sub

Upvotes: 0

barrowc
barrowc

Reputation: 10679

There's no need to scrape Stack Overflow when they make the underlying data available to you through things like the Data Explorer. Using this query in the Data Explorer should get you the results you need:

select t.TagName, t.Count, p.Body
 from Tags t inner join Posts p
 on t.ExcerptPostId = p.Id
 order by t.count desc;

The permalink to that query is here and the "Download CSV" option which appears after the query runs is probably the easiest way to get the data into Excel. If you wanted to automate that part of things, the direct link to the CSV download of results is here

Upvotes: 1

Related Questions