I'm working on a Slack bot that acts as an automated WPScan tool: whenever an article mentioning WordPress pops up in our RSS feed, the bot checks our WordPress sites for the vulnerability it describes. I'm struggling to tune it so it can reliably detect the plugin name from the article and pass it to WPScan as the scan keyword. I'm also hitting API rate limits when pulling CVE details from the NVD. I tried to work around that by querying the GitHub Advisory Database instead, but the lookup method I was using with the NVD doesn't carry over, because GitHub uses its own advisory numbering scheme (GHSA IDs) rather than CVE numbers.

I'm a security guy, not a dev or a coder, so I may be missing something obvious. The part I've been troubleshooting is the ArticleParser class below. I can post the full code if it helps; my personal API keys live in a separate file in case I publish the repo on GitHub, since this could be a useful automation for others. I just can't solve the issue of pulling the information I want from the articles and/or the CVE reports. Any suggestions?
Code:
import re
import time
import logging
from http import HTTPStatus

import requests
from bs4 import BeautifulSoup

# These live at module level elsewhere in the full script; included here so
# the snippet runs on its own.
scraper_logger = logging.getLogger('scraper')
debug_logger = logging.getLogger('debug')
http_session = requests.Session()


class ArticleParser:
    def __init__(self, github_token):
        # CSS selectors that usually wrap the article body on news/blog sites
        self.common_article_selectors = [
            'article',
            '.post-content',
            '.entry-content',
            'main',
            '#content'
        ]
        self.github_headers = {
            'Accept': 'application/vnd.github.v3+json',
            'Authorization': f'token {github_token}'
        }

    def format_error_message(self, url, error):
        base_message = f"⚠️ Unable to parse article from {url}\n"
        solutions = ["Try manual keyword scan with /scan [plugin name]"]
        if isinstance(error, requests.exceptions.SSLError):
            solutions.append("Site's security certificate might be invalid")
        elif isinstance(error, requests.exceptions.ConnectionError):
            solutions.append("Check if the site is accessible")
        elif isinstance(error, requests.exceptions.Timeout):
            solutions.append("Site might be temporarily slow or unavailable")
        elif isinstance(error, requests.exceptions.HTTPError):
            if error.response.status_code == 403:
                solutions.append("Article might be behind a paywall")
            elif error.response.status_code == 404:
                solutions.append("Article might have been removed")
        return base_message + "\nPossible solutions:\n" + "\n".join(f"- {s}" for s in solutions)

    def extract_article_info(self, url):
        scraper_logger.info(f"Starting article extraction for: {url}")
        try:
            response = http_session.get(url, timeout=10)
            # Without raise_for_status(), the 403/404 branches in
            # format_error_message can never fire
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            content = None
            for selector in self.common_article_selectors:
                element = soup.select_one(selector)
                if element:
                    content = element.get_text()
                    break
            if not content:
                scraper_logger.warning(f"Could not find article content with common selectors: {url}")
                content = soup.get_text()
            vuln_info = self.extract_vulnerability_info(content)
            debug_logger.debug(f"Extracted vulnerability info: {vuln_info}")
            # Only fall back to article content plugin detection if we didn't get keywords from vuln source
            plugin_info = (
                {'plugin_name': ' '.join(vuln_info['plugin_keywords']), 'version': None}
                if vuln_info['plugin_keywords']
                else self.extract_plugin_info(content)
            )
            debug_logger.debug(f"Extracted plugin info: {plugin_info}")
            return {
                **plugin_info,
                'vulnerability': vuln_info['description'] if vuln_info['description'] else "No vulnerability details found"
            }
        except Exception as e:
            error_message = self.format_error_message(url, e)
            scraper_logger.error(f"Article parsing error: {str(e)}", exc_info=True)
            return {'error': error_message}

    def extract_plugin_info(self, content):
        plugin_patterns = [
            r'(?i)"([^"]+)"\s+(?:wordpress\s+)?plugin',
            r'(?i)plugin\s+(?:called|named)\s+"([^"]+)"',
            r'(?i)(?:wordpress\s+)?plugin\s+([A-Za-z0-9\s-]+?)\s+(?:is|has|contains)',
            r'(?i)([A-Za-z0-9\s-]+?)\s+is\s+a\s+(?:wordpress\s+)?plugin',
            r'(?i)vulnerability\s+in\s+(?:the\s+)?([A-Za-z0-9\s-]+?)\s+plugin'
        ]
        version_patterns = [
            r'version\s*((?:\d+\.)?(?:\d+\.)?(?:\*|\d+))',
            r'v((?:\d+\.)?(?:\d+\.)?(?:\*|\d+))',
            r'affected\s+versions?\s*:?\s*((?:\d+\.)?(?:\d+\.)?(?:\*|\d+))'
        ]
        plugin_name = None
        version = None
        for pattern in plugin_patterns:
            match = re.search(pattern, content, re.I)
            if match:
                candidate = match.group(1).strip()
                if len(candidate.split()) <= 5 and not any(term in candidate.lower() for term in ['bug', 'issue', 'vulnerability']):
                    plugin_name = candidate
                    break
        for pattern in version_patterns:
            match = re.search(pattern, content, re.I)
            if match:
                version = match.group(1)
                break
        return {
            'plugin_name': plugin_name,
            'version': version
        }

    def extract_vulnerability_info(self, content):
        cve_pattern = r'CVE-\d{4}-\d{4,7}'
        cve_match = re.search(cve_pattern, content)
        vuln_info = {
            'description': None,
            'source': None,
            'plugin_keywords': []
        }
        if not cve_match:
            vuln_patterns = [
                r'vulnerability[^.]*allows[^.]*\.',
                r'security (issue|flaw)[^.]*\.',
                r'(?:plugin|theme)[^.]*vulnerable[^.]*\.'
            ]
            for pattern in vuln_patterns:
                match = re.search(pattern, content, re.I)
                if match:
                    scraper_logger.info("Vulnerability info extracted from article content")
                    vuln_info['description'] = match.group(0).strip()
                    vuln_info['source'] = 'article'
                    return vuln_info
            return vuln_info
        cve_number = cve_match.group(0)
        try:
            # Try GitHub Advisory Database
            scraper_logger.info(f"Attempting to fetch CVE data from GitHub Advisory Database for {cve_number}")
            search_url = f"https://api.github.com/search/advisories?q={cve_number}"
            response = requests.get(search_url, headers=self.github_headers)
            if response.status_code == HTTPStatus.OK:
                data = response.json()
                if data.get('items') and len(data['items']) > 0:
                    advisory = data['items'][0]
                    ghsa_id = advisory.get('ghsa_id')
                    if ghsa_id:
                        advisory_url = f"https://api.github.com/advisories/{ghsa_id}"
                        advisory_response = requests.get(advisory_url, headers=self.github_headers)
                        if advisory_response.status_code == HTTPStatus.OK:
                            advisory_data = advisory_response.json()
                            description = advisory_data.get('summary')
                            if description:
                                scraper_logger.info(f"Successfully retrieved vulnerability info from GitHub (GHSA: {ghsa_id})")
                                vuln_info['description'] = f"{cve_number}: {description}"
                                vuln_info['source'] = 'github'
                                # Extract first few words as potential plugin name
                                title_words = description.split()[:5]
                                vuln_info['plugin_keywords'] = [word.lower() for word in title_words
                                                                if len(word) > 3 and word.lower() not in
                                                                ['wordpress', 'the', 'and', 'for', 'with', 'plugin']]
                                return vuln_info
            # If GitHub fails, try NVD API
            scraper_logger.info(f"Attempting to fetch CVE data from NVD API for {cve_number}")
            time.sleep(2)
            nvd_url = f"https://services.nvd.nist.gov/rest/json/cves/2.0?cveId={cve_number}"
            response = http_session.get(nvd_url)
            data = response.json() if response.status_code == HTTPStatus.OK else {}
            if data and isinstance(data, dict):
                vulns = data.get('vulnerabilities', [])
                if vulns and len(vulns) > 0:
                    vuln = vulns[0]
                    if vuln and isinstance(vuln, dict):
                        cve_data = vuln.get('cve', {})
                        if cve_data and isinstance(cve_data, dict):
                            descriptions = [d.get('value', '') for d in cve_data.get('descriptions', [])
                                            if d.get('lang') == 'en' and len(d.get('value', '')) > 50]
                            if descriptions:
                                description = max(descriptions, key=len)
                                scraper_logger.info("Successfully retrieved vulnerability info from NVD API")
                                vuln_info['description'] = f"{cve_number}: {description}"
                                vuln_info['source'] = 'nvd'
                                # Extract first few words as potential plugin name
                                title_words = description.split()[:5]
                                vuln_info['plugin_keywords'] = [word.lower() for word in title_words
                                                                if len(word) > 3 and word.lower() not in
                                                                ['wordpress', 'the', 'and', 'for', 'with', 'plugin']]
                                return vuln_info
        except Exception as e:
            scraper_logger.error(f"Error fetching CVE details: {str(e)}")
        # Extract from article as fallback
        vuln_patterns = [
            r'vulnerability[^.!?]*[.!?]',
            r'security (?:issue|flaw)[^.!?]*[.!?]',
            r'(?:plugin|theme)[^.!?]*vulnerable[^.!?]*[.!?]'
        ]
        full_description = []
        for pattern in vuln_patterns:
            matches = re.finditer(pattern, content, re.I)
            for match in matches:
                full_description.append(match.group(0).strip())
        if full_description:
            scraper_logger.info("Using article content as fallback for vulnerability description")
            vuln_info['description'] = f"{cve_number}: {' '.join(full_description)}"
            vuln_info['source'] = 'article'
            return vuln_info
        return vuln_info
I've tried using the NVD API to pull the plugin name and summary, and that worked for a while, but I've since been rate limited because I re-run the script so frequently while testing changes.
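The direction I'm leaning for the rate limiting is to request an NVD API key and space the calls out, since the NVD docs mention a higher request allowance when the key is sent in an apiKey header. Here's a rough sketch of the helper I have in mind (the 6-second spacing is just my guess at a safe interval, and NVD_API_KEY would come from my separate keys file):

import time
import requests

from keys import NVD_API_KEY  # my separate keys file

NVD_BASE = 'https://services.nvd.nist.gov/rest/json/cves/2.0'
MIN_INTERVAL = 6.0  # guessed spacing between NVD calls to stay under the limit
_last_call = 0.0

def fetch_cve(cve_id):
    """Fetch a single CVE record from the NVD, throttling between calls."""
    global _last_call
    wait = MIN_INTERVAL - (time.time() - _last_call)
    if wait > 0:
        time.sleep(wait)
    # The NVD 2.0 API reads the key from an 'apiKey' request header
    response = requests.get(
        NVD_BASE,
        params={'cveId': cve_id},
        headers={'apiKey': NVD_API_KEY},
        timeout=10
    )
    _last_call = time.time()
    response.raise_for_status()
    return response.json()

The idea is that every NVD call funnels through this one helper so the spacing is enforced in a single place, but I don't know if that's the usual pattern for this.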
I also tried switching to the GitHub Advisory Database, but the URL pattern I was using for the NVD (www.vulndatabasesite.com/whatever-cve-number) doesn't work there, because GitHub uses its own numbering scheme. CVEs can be searched on the site and the advisory pages are titled with CVE numbers, but the actual URLs use GitHub's own GHSA identifiers.
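Digging through the GitHub REST docs afterwards, it looks like the global advisories endpoint accepts a cve_id filter directly, which would sidestep translating CVE numbers into GHSA IDs by hand. A minimal sketch of what I think the lookup should look like, assuming I'm reading the docs correctly (github_token is the same token the class already takes):

import requests

def lookup_advisory_by_cve(cve_id, github_token):
    """Look up a GitHub global security advisory by its CVE number."""
    response = requests.get(
        'https://api.github.com/advisories',
        params={'cve_id': cve_id},
        headers={
            'Accept': 'application/vnd.github+json',
            'Authorization': f'Bearer {github_token}'
        },
        timeout=10
    )
    response.raise_for_status()
    advisories = response.json()  # the endpoint returns a JSON array
    if not advisories:
        return None
    advisory = advisories[0]
    # Each advisory object carries its GHSA id alongside summary/description
    return {
        'ghsa_id': advisory.get('ghsa_id'),
        'summary': advisory.get('summary'),
        'description': advisory.get('description')
    }

If that endpoint does what I think it does, I could replace the search-then-fetch dance in extract_vulnerability_info with a single call, but I haven't been able to confirm the behavior yet.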