Reputation: 1808
So basically I am trying to get all the information from a webpage via Selenium, but it gets slower over time, to the point where it won't finish the list of people and will just time out.
I understand that it should get slower over time, as I am keeping a HashSet of ids that have already been accounted for and checking on every loop to see whether the id occurred previously.
I've attached a bunch of code showing how it pulls the data off the site, but I doubt that the "how" is the main problem. I think I am overlooking something, or have some sort of resource leak, or am hitting a Selenium limitation...
If I start the WebManager in quiet mode, it doesn't mess up until after more than 120 loops. If I start it non-quiet with the normal ChromeDriver, it eventually messes up, throws an error, and skips people... I assume because I touched the webpage while it was processing, or something like that.
All other issues aside,
WebManager class:
/// <summary>
/// Starts a Chrome browser session and navigates it to <paramref name="website"/>.
/// </summary>
/// <param name="website">URL to open once the browser has started.</param>
/// <remarks>
/// If the initial navigation fails, the browser session is shut down before the
/// exception is rethrown. A failed constructor never hands an instance back to the
/// caller, so nobody else would be able to close the already-launched browser.
/// </remarks>
public WebManager(string website)
{
    driver = new ChromeDriver();
    try
    {
        driver.Navigate().GoToUrl(website);
    }
    catch
    {
        // Do not leave an orphaned browser / chromedriver process behind.
        driver.Quit();
        throw;
    }
}
/// <summary>
/// Creates the web driver, either as a visible Chrome browser or as a "quiet"
/// remote HtmlUnit session backed by a locally spawned Java server.
/// </summary>
/// <param name="quiet">
/// <c>false</c> to drive a regular <c>ChromeDriver</c>; <c>true</c> to launch
/// <c>java.exe -jar quietserver.jar</c> without a window and connect a
/// <c>RemoteWebDriver</c> with HtmlUnit capabilities to it.
/// </param>
/// <remarks>
/// If the remote driver cannot be created, the spawned server process is killed
/// (best effort) before the exception is rethrown, so a failed construction does
/// not leave a stray <c>java.exe</c> running.
/// NOTE(review): the remote driver is created immediately after the process is
/// started; presumably the server needs some time before it accepts connections -
/// confirm whether a readiness wait is required here.
/// </remarks>
public WebManager(Boolean quiet)
{
    if (!quiet)
    {
        driver = new ChromeDriver();
        return;
    }

    var processInfo = new ProcessStartInfo("java.exe", "-jar quietserver.jar")
    {
        CreateNoWindow = true,
        UseShellExecute = false
    };
    quietServer = Process.Start(processInfo);
    try
    {
        driver = new RemoteWebDriver(DesiredCapabilities.HtmlUnit());
    }
    catch
    {
        // The constructor is about to fail, so no caller will ever be able to shut
        // the server down; stop it here. Cleanup is best effort and must not mask
        // the original exception.
        try
        {
            if (quietServer != null && !quietServer.HasExited)
            {
                quietServer.Kill();
            }
        }
        catch (InvalidOperationException)
        {
            // The process exited between the HasExited check and Kill().
        }
        catch (System.ComponentModel.Win32Exception)
        {
            // The process could not be terminated; nothing more can be done here.
        }
        throw;
    }
}
The main process of the program:
/// <summary>
/// Walks the offender list of each county, opens every offender's detail page,
/// reads the fields from the page via XPath and saves one <c>Offender</c> per id.
/// </summary>
/// <remarks>
/// <para>
/// When <c>localScrape</c> is false, counties from <c>startCounty</c> through 64 are
/// visited, where <c>startCounty</c> is the number of directories already present
/// under <c>utils.savePath</c> (or 1 when there are none). When <c>localScrape</c>
/// is true, indexes 0 through 2 of <c>localCounties</c> are visited instead, and the
/// run is refused if more than four directories already exist.
/// </para>
/// <para>
/// The web driver is always closed when this method finishes, including when an
/// exception escapes the county loop (for example a failed navigation to a county
/// list). Previously <c>webManager.close()</c> was only reached on the normal path,
/// which left the browser / server process running after such a failure.
/// </para>
/// <para>
/// NOTE(review): after each offender the whole county list page is loaded again
/// before the next row is looked up, so every offender costs two page loads.
/// </para>
/// </remarks>
public void doScrape()
{
    int fileCount = Directory.GetDirectories(utils.savePath).Length;
    int startCounty = (fileCount == 0 ? 1 : fileCount);
    string lastOffenderId = null;
    if (fileCount > 4 && localScrape)
    {
        Console.WriteLine("Please clear storage folders...");
        Console.Read();
        Environment.Exit(1);
    }
    webManager = new WebManager(quiet);
    try
    {
        for (int i = (localScrape ? 0 : startCounty); i <= (localScrape ? 2 : 64); i++)
        {
            webManager.driver.Navigate().GoToUrl(getOffenderListURL((localScrape ? localCounties[i] : i)));
            // Ids already saved for this county; the same id appears once per alias in the list.
            HashSet<string> completedList = new HashSet<string>();
            // The header text is "<label>: <county name>"; keep only the part after ": ".
            string locationStr = webManager.getElementByxPath(countyxPath).Text;
            locationStr = locationStr.Substring(locationStr.IndexOf(':') + 2);
            Console.WriteLine("Working on county: " + locationStr);
            for (int l = 2; l < 10000; l++)
            {
                try
                {
                    // Wait up to five seconds for row l of the list to exist. When it never
                    // appears the wait throws, which the catch at the bottom turns into a break
                    // (presumably the normal way the row loop ends - confirm).
                    var element1 = new WebDriverWait(webManager.driver, TimeSpan.FromSeconds(5)).Until(ExpectedConditions.ElementExists((By.XPath(getOffenderxPath(l)))));
                    string linkToOffender = element1.GetAttribute("href");
                    // The offender id is everything after the first '=' in the link.
                    string offenderId = linkToOffender.Substring(linkToOffender.IndexOf('=') + 1);
                    if (completedList.Contains(offenderId))
                    {
                        Console.WriteLine("Offender id " + offenderId + " has multiple aliases one of which is: " + element1.Text);
                        continue;
                    }
                    lastOffenderId = offenderId;
                    element1.Click();
                    var currentPlacement = webManager.getElementTextByxPath(currentPlacementxPath);
                    var lastName = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 2));
                    var firstName = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 3));
                    var middleName = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 4));
                    var dob = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 5));
                    var sex = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 6));
                    var riskLevel = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 7));
                    var designation = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 8));
                    Console.WriteLine("Offender info: " + currentPlacement + " " + lastName + " " + firstName + " " + middleName + " " + dob + " " + sex + " " + designation);
                    var race = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 1));
                    var ethnicity = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 2));
                    var height = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 3));
                    var weight = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 4));
                    var hair = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 5));
                    var eyes = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 6));
                    var lenses = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 7));
                    var photodate = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 8));
                    var jurisdiction = webManager.getElementTextByxPath(jurisductionxPath);

                    // ------------ Logic for addresses ------------------------
                    // Read address rows 1, 2, ... until a row is missing (at most 19 rows).
                    Address[] addresses;
                    List<Address> addressList = new List<Address>();
                    for (int x = 1; x < 20; x++)
                    {
                        try
                        {
                            var address_1 = webManager.driver.FindElement(By.XPath(getOffenderAddress(x, 1)));
                            if (address_1 != null)
                            {
                                Address adds = new Address();
                                adds.type = webManager.getElementTextByxPath(getOffenderAddress(x, 1));
                                adds.county = webManager.getElementTextByxPath(getOffenderAddress(x, 2));
                                adds.location = webManager.getElementTextByxPath(getOffenderAddress(x, 3));
                                addressList.Add(adds);
                            }
                        }
                        catch (NoSuchElementException)
                        {
                            break;
                        }
                    }
                    Console.WriteLine(addressList.Count > 1 ? "Multiple addresses... listing" : "Only one address found");
                    foreach (Address aa in addressList)
                    {
                        Console.WriteLine(aa.ToString());
                    }
                    addresses = addressList.ToArray();
                    // --------------- end of address logic --------------------

                    //---------- Current Conviction logic -----------------------
                    Conviction currentConviction = new Conviction();
                    ConvictionDetails[] convictionDetails;
                    List<ConvictionDetails> currentConvictionDetails = new List<ConvictionDetails>();
                    for (int x = 1; x < 20; x++)
                    {
                        try
                        {
                            /*
                             * Not happy about this but it has to be done this way.
                             *
                             * Checks span[1] of row x to see whether the row is still a
                             * conviction detail or whether it starts the list of
                             * conviction information (its text then contains "Date").
                             */
                            var spanTitle = webManager.driver.FindElement(By.XPath(getConvictionTitlexPath(x)));
                            if (spanTitle.Text.Contains("Date"))
                            {
                                break;
                            }
                            var title = webManager.driver.FindElement(By.XPath(getConvictionDetailsxPath(x, 1)));
                            var section = webManager.getElementTextByxPath(getConvictionDetailsxPath(x, 2));
                            var subsection = webManager.getElementTextByxPath(getConvictionDetailsxPath(x, 3));
                            var c_class = webManager.getElementTextByxPath(getConvictionDetailsxPath(x, 4));
                            var categlory = webManager.getElementTextByxPath(getConvictionDetailsxPath(x, 5));
                            var counts = webManager.getElementTextByxPath(getConvictionDetailsxPath(x, 6));
                            var desc = webManager.getElementTextByxPath(getConvictionDetailsxPath(x, 7));
                            ConvictionDetails cDetails = new ConvictionDetails();
                            cDetails.c_class = c_class;
                            cDetails.categlory = categlory;
                            cDetails.counts = counts;
                            cDetails.description = desc;
                            cDetails.section = section;
                            cDetails.title = title.Text;
                            cDetails.subsection = subsection;
                            currentConvictionDetails.Add(cDetails);
                        }
                        catch (NoSuchElementException)
                        {
                            break;
                        }
                    }
                    convictionDetails = currentConvictionDetails.ToArray();
                    // The conviction data row is the one directly after the last detail row.
                    var dateOfCrime = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 1));
                    var convictionDate = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 2));
                    var victiminfo = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 3));
                    var arrestingAgency = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 4));
                    var offenseDescription = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 5));
                    var relationship = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 6));
                    var weapon = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 7));
                    var force = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 8));
                    var computer = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 9));
                    var porn = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 10));
                    var sentance = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 11));
                    currentConviction.arrestingAgency = arrestingAgency;
                    currentConviction.computerUsed = computer;
                    currentConviction.convictionDate = convictionDate;
                    currentConviction.crimeDate = dateOfCrime;
                    currentConviction.forceUsed = force;
                    currentConviction.offenseDescription = offenseDescription;
                    currentConviction.pornInvolved = porn;
                    currentConviction.relationship = relationship;
                    currentConviction.sentance = sentance;
                    currentConviction.victimInfo = victiminfo;
                    currentConviction.weaponsUsed = weapon;
                    currentConviction.details = convictionDetails;
                    Console.WriteLine("-------Current Conviction --------");
                    Console.WriteLine(currentConviction.ToString());
                    //----------- End Current Conviction logic -------------------

                    //----------- Previous Conviction logic ----------------------
                    Conviction[] previousConvictions = null;
                    int lastDiv = 0;
                    List<Conviction> previousConvictionsList = new List<Conviction>();
                    for (int x = 3; x < 10; x++)
                    {
                        List<ConvictionDetails> prevConvictionDetailsList = new List<ConvictionDetails>();
                        // Declared per iteration, so it is always 0 when compared below.
                        int last = 0;
                        try
                        {
                            for (int y = 1; y < 10; y++)
                            {
                                try
                                {
                                    var spanTitle = webManager.driver.FindElement(By.XPath(getListTitlexPathByDiv(x, y)));
                                    if (!spanTitle.Text.Contains("Title"))
                                    {
                                        break;
                                    }
                                    var title = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 1));
                                    var section = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 2));
                                    var subsection = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 3));
                                    var c_class = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 4));
                                    var categlory = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 5));
                                    var counts = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 6));
                                    var desc = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 7));
                                    ConvictionDetails prevconvictionDetails = new ConvictionDetails();
                                    prevconvictionDetails.c_class = c_class;
                                    prevconvictionDetails.categlory = categlory;
                                    prevconvictionDetails.counts = counts;
                                    prevconvictionDetails.description = desc;
                                    prevconvictionDetails.section = section;
                                    prevconvictionDetails.title = title;
                                    prevconvictionDetails.subsection = subsection;
                                    prevConvictionDetailsList.Add(prevconvictionDetails);
                                }
                                catch (NoSuchElementException)
                                {
                                    break;
                                }
                            }
                            // Remember the div index just visited. It is recorded before the
                            // emptiness check below, so when the loop stops on an empty div the
                            // value is one past the last div that held a conviction; the vehicle
                            // logic further down relies on that.
                            lastDiv = x;
                            if (prevConvictionDetailsList.Count == last)
                            {
                                break;
                            }
                            last = prevConvictionDetailsList.Count;
                            webManager.driver.FindElement(By.XPath(getPreviousMoreInfoButton(x))).Click();
                            // Fixed pause after clicking the "more info" button before reading
                            // the fields (presumably to let the section expand - confirm).
                            Thread.Sleep(1000);
                            var prevDateOfCrime = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 1));
                            var prevConvictionDate = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 2));
                            var prevVictiminfo = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 3));
                            var prevArrestingAgency = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 4));
                            var prevOffenseDescription = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 5));
                            var prevRelationship = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 6));
                            var prevWeapon = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 7));
                            var prevForce = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 8));
                            var prevComputer = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 9));
                            var prevPorn = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 10));
                            var prevSentance = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 11));
                            Conviction previousConviction = new Conviction();
                            previousConviction.arrestingAgency = prevArrestingAgency;
                            previousConviction.computerUsed = prevComputer;
                            previousConviction.convictionDate = prevConvictionDate;
                            previousConviction.crimeDate = prevDateOfCrime;
                            previousConviction.forceUsed = prevForce;
                            previousConviction.offenseDescription = prevOffenseDescription;
                            previousConviction.pornInvolved = prevPorn;
                            previousConviction.relationship = prevRelationship;
                            previousConviction.sentance = prevSentance;
                            previousConviction.victimInfo = prevVictiminfo;
                            previousConviction.weaponsUsed = prevWeapon;
                            previousConviction.details = prevConvictionDetailsList.ToArray();
                            previousConvictionsList.Add(previousConviction);
                        }
                        catch (NoSuchElementException)
                        {
                            break;
                        }
                    }
                    if (previousConvictionsList.Count > 0)
                    {
                        previousConvictions = previousConvictionsList.ToArray();
                        Console.WriteLine("-----Previous convictions------");
                        foreach (Conviction c in previousConvictions)
                        {
                            Console.WriteLine(c.ToString());
                        }
                    }
                    //-------------- End of Conviction logic ---------------------

                    //-----------------Beginning of Supervising until Scars--------
                    // The paragraph index of the supervising section depends on whether any
                    // previous convictions were found.
                    int adjustedParagraph = (previousConvictions == null ? 4 : 3);
                    var supervisingAgency = webManager.getElementTextByxPath(getMainContentParagraph(adjustedParagraph));
                    var specialConditions = webManager.getElementTextByxPath(getMainContentParagraph(adjustedParagraph + 1));
                    var maximumExpire = webManager.getElementTextByxPath(getMainContentParagraph(adjustedParagraph + 2));
                    //-----------------End of Supervising until Scars--------------

                    //---------------- Scars logic --------------------------------
                    /*
                     * At this point the last paragraph used was adjustedParagraph + 2.
                     * Paragraphs are collected as markings for as long as they are located
                     * above the alias heading on the page.
                     */
                    int lastParagraph = 0;
                    List<string> markingList = new List<string>();
                    String[] markings = null;
                    for (int x = (adjustedParagraph + 3); x < (adjustedParagraph + 13); x++)
                    {
                        var marking1 = webManager.getElementTextByxPath(getMainContentParagraph(x));
                        if (marking1.Contains("None"))
                        {
                            lastParagraph = x;
                            break;
                        }
                        var aliasHeadingEle = webManager.getElementByxPath(aliasHeadingxPath);
                        var webEle1 = webManager.getElementByxPath(getMainContentParagraph(x));
                        if (aliasHeadingEle.Location.Y > webEle1.Location.Y)
                        {
                            markingList.Add(webEle1.Text);
                            lastParagraph = x;
                        }
                        else
                        {
                            break;
                        }
                    }
                    markings = markingList.ToArray();
                    //------------------ End Scars logic -------------------------

                    //------------------------ Alias Logic --------------------
                    // Same approach as for the markings: paragraphs count as aliases for as
                    // long as they are located above the current-vehicle heading.
                    int lastParagraph2 = 0;
                    List<string> aliasList = new List<string>();
                    String[] aliases = null;
                    for (int x = (lastParagraph + 1); x < (lastParagraph + 10); x++)
                    {
                        try
                        {
                            var alias1 = webManager.driver.FindElement(By.XPath(getMainContentParagraph(x)));
                            if (alias1.Text.Contains("None"))
                            {
                                lastParagraph2 = x;
                                break;
                            }
                        }
                        catch (NoSuchElementException)
                        {
                            break;
                        }
                        var currentVehicleHeading = webManager.driver.FindElement(By.XPath(currentVehiclexPath));
                        var webEle1 = webManager.driver.FindElement(By.XPath(getMainContentParagraph(x)));
                        if (currentVehicleHeading.Location.Y > webEle1.Location.Y)
                        {
                            aliasList.Add(webEle1.Text);
                            Console.WriteLine("Offender has alias: " + webEle1.Text);
                            lastParagraph2 = x;
                        }
                        else
                        {
                            break;
                        }
                    }
                    aliases = aliasList.ToArray();
                    //------------------- End Alias logic -------------------

                    //--------------------- Vehicle Logic ---------------------------
                    /*
                     * I feel bad about doing work in a catch... but for some reason I can't
                     * think of a better way at the moment so I am just going with it.
                     *
                     * If you can make this logic better please do so...
                     *
                     * lastDiv is already one more than the last conviction div used; see the
                     * comment where it is assigned above.
                     *
                     * The vehicle rows are only read when the probed paragraph element is
                     * missing (presumably that paragraph is shown when there are no vehicles -
                     * confirm). When it exists, vehicles stays null.
                     */
                    Vehicle[] vehicles = null;
                    try
                    {
                        webManager.driver.FindElement(By.XPath(getVehiclePxPath(lastDiv)));
                    }
                    catch (NoSuchElementException)
                    {
                        List<Vehicle> vehicleList = new List<Vehicle>();
                        for (int x = 1; x < 10; x++)
                        {
                            try
                            {
                                var vehiclePlate = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 1)));
                                var vehicleState = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 2)));
                                var vehicleYear = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 3)));
                                var vehicleModel = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 4)));
                                var vehicleColor = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 5)));
                                Vehicle vehicle1 = new Vehicle();
                                vehicle1.color = vehicleColor.Text;
                                vehicle1.makeModel = vehicleModel.Text;
                                vehicle1.plate = vehiclePlate.Text;
                                vehicle1.state = vehicleState.Text;
                                vehicle1.year = vehicleYear.Text;
                                vehicleList.Add(vehicle1);
                            }
                            catch (NoSuchElementException)
                            {
                                break;
                            }
                        }
                        vehicles = vehicleList.ToArray();
                    }
                    //--------------------- End Vehicle Logic -------------------------

                    //-------- Creating & Adding fields into Offender Object----------
                    Offender offender = new Offender();
                    offender.currentPlacement = currentPlacement;
                    offender.designation = designation;
                    offender.ethnicity = ethnicity;
                    offender.dob = dob;
                    offender.eyeColor = eyes;
                    offender.hairColor = hair;
                    offender.CorrectiveLens = lenses;
                    offender.height = height;
                    offender.weight = weight;
                    offender.photoDate = photodate;
                    offender.offenderId = offenderId;
                    offender.riskLevel = riskLevel;
                    offender.race = race;
                    offender.sex = sex;
                    offender.lastName = lastName;
                    offender.firstName = firstName;
                    offender.middleName = middleName;
                    offender.address = addresses;
                    offender.jurisdiction = jurisdiction;
                    offender.currentConviction = currentConviction;
                    offender.perviousConvictions = previousConvictions;
                    offender.supervisingInfo = supervisingAgency;
                    offender.conditions = specialConditions;
                    offender.maximumDate = maximumExpire;
                    offender.markings = markings;
                    offender.aliases = aliases;
                    offender.currentVehicles = vehicles;
                    offender.linkToPic = getPhotoLink(offenderId);
                    offender.Save(utils.getSaveLocation(locationStr, offender.offenderId));

                    //------ add to completed offender id list --------
                    completedList.Add(offenderId);
                    // Go back to the county list so the next row can be looked up.
                    webManager.driver.Navigate().GoToUrl(getOffenderListURL((localScrape ? localCounties[i] : i)));
                }
                catch (Exception e)
                {
                    // Any failure while handling a row ends the current county; the county
                    // loop then moves on to the next one.
                    // NOTE(review): this also abandons the remaining rows of the county when
                    // a single offender page fails to parse - confirm that is intended.
                    Console.WriteLine(e.Message);
                    Console.WriteLine("Last offender id " + lastOffenderId);
                    break;
                }
            }
        }
    }
    finally
    {
        // Always release the browser / server, even when the county loop throws.
        webManager.close();
    }
}
Upvotes: 3
Views: 364