Reputation: 1099
I have two files: one is MAIN.py and the other is xpath_from_element.js. I used QWebEngineWidgets to display the web page in the app, and linked the .py and .js files to get the tagName/XPath of an element on the website when one clicks on it. I am currently getting the tagNames of the elements.
What if I need to get both the tagName and the className at once? For example, clicking on a button should give me both its tagName and its className. Currently it gives me only one of them. I have tried doing that, but it always gives me an error, so I have posted the original, working code rather than my error-producing code, because I don't know how to do it and was unable to find any resources on it.
Here is Main.py :
from PySide2 import QtCore, QtWidgets, QtWebEngineWidgets, QtWebChannel, QtGui
from PySide2.QtWidgets import QApplication, QComboBox, QWidget, QVBoxLayout, QSizeGrip, QGridLayout, QInputDialog, QLineEdit
from jinja2 import Template
import sqlite3
import time
from D_scraper import to_do
import D_scraper2
# Module-wide SQLite connection used by insert_url_to_sql() and Form.on_clicked().
conn = sqlite3.connect(r'C:\Users\intel\Desktop\Crawl_jar\test.db')

# IF NOT EXISTS makes start-up idempotent and creates each table on its own:
# previously both statements shared one try-block guarded by a bare `except`,
# so an already existing `Classes` table prevented `urls` from ever being
# created, and every other database error was silently discarded.
conn.execute("""CREATE TABLE IF NOT EXISTS Classes (id VARCHAR(30));""")
conn.execute("""CREATE TABLE IF NOT EXISTS urls (id VARCHAR(30));""")
conn.commit()
def insert_url_to_sql(the_url):
    """Insert *the_url* as a new row of the ``urls`` table and commit.

    The value is passed as a bound parameter instead of being formatted into
    the SQL text, so URLs containing quotes are stored verbatim and cannot
    alter the statement.
    """
    conn.execute("INSERT INTO urls VALUES (?);", (the_url,))
    conn.commit()
class Form(QtWidgets.QWidget):
    """Main window: Execute button, URL field, Load button, combo box and web view.

    The web view is created on the first call to :meth:`load_the_url`. Names
    reported by the page through the ``classname_helper`` object are written
    to the ``Classes`` table, followed by the current URL in ``urls``.
    """

    def __init__(self, parent=None):
        super(Form, self).__init__(parent)
        self.setWindowTitle('test')
        self.setMinimumSize(320, 240)
        self.resize(640, 480)

        self.btn = QtWidgets.QPushButton(self)
        self.btn.move(15, 6)
        self.btn.setFixedSize(80, 30)
        self.btn.setText("Excecute")
        self.btn.clicked.connect(self.run_myscript)

        self.text_url = QLineEdit(self)
        self.text_url.move(100, 6)
        self.text_url.setFixedSize(140, 30)

        self.btn2 = QtWidgets.QPushButton(self)
        self.btn2.move(250, 6)
        self.btn2.setFixedSize(80, 30)
        self.btn2.setText("Load")
        self.btn2.clicked.connect(self.load_the_url)

        self.cb = QComboBox(self)
        self.cb.addItem("1")
        self.cb.addItem("2")
        self.cb.addItems(["3", "4", "5"])
        self.cb.currentIndexChanged.connect(self.selectionchange)
        self.cb.move(350, 7)
        self.cb.setFixedSize(35, 28)
        self.cb.show()

        # The signal is connected after the items were added, so no change
        # notification has fired yet; seed the value so run_myscript() can be
        # called before the user touches the combo box.
        self.combo_value = self.cb.currentText()

        # Created lazily by load_the_url() and reused for later loads.
        self.view = None
        self.page = None

        self.show()

    def run_myscript(self):
        """Hide the window, quit the application and run the scraper when "3" is selected."""
        if self.combo_value == "3":
            # Use the instance and the static quit() rather than the
            # module-level `window`/`app` names, which only exist when this
            # file is run as a script.
            self.hide()
            QtWidgets.QApplication.quit()
            D_scraper2.to_do()
        else:
            print("Error no option selected!")
            print("Select a value...")

    def selectionchange(self):
        """Remember and print the combo box's current text."""
        print("Items in the list are :")
        self.combo_value = self.cb.currentText()
        print(self.combo_value)

    def load_the_url(self):
        """Clean the text of the URL field and load it in the embedded web view."""
        self.url_string = self.text_url.text()

        # Strip the same fragments, in the same order, as before.
        cleaned = self.url_string
        for fragment in ("(", ")", "'", ",", "True", " "):
            cleaned = cleaned.replace(fragment, "")
        self.url = cleaned
        print(self.url)

        # Build the view, page and helper only once: creating a new
        # QWebEngineView on every click left all previous views alive as
        # children of this widget, stacked underneath the newest one.
        if self.view is None:
            self.view = QtWebEngineWidgets.QWebEngineView(self)
            self.view.move(10, 80)
            self.view.setFixedSize(1345, 500)
            classname_helper = Helper("classname_helper")
            classname_helper.classClicked.connect(self.on_clicked)
            self.page = WebEnginePage()
            self.page.add_object(classname_helper)
            self.view.setPage(self.page)

        self.view.load(QtCore.QUrl(self.url))
        print(str(self.url))
        self.view.show()

    def on_clicked(self, name):
        """Store *name* in the ``Classes`` table and the current URL in ``urls``."""
        print(name)
        # Bound parameter: the value comes from an arbitrary web page and must
        # not be formatted into the SQL text.
        conn.execute("INSERT INTO Classes VALUES (?);", (name,))
        conn.commit()
        insert_url_to_sql(self.url)
class Element(QtCore.QObject):
    """Named QObject that can supply a JavaScript snippet through script()."""

    def __init__(self, name, parent=None):
        super().__init__(parent)
        self._name = name

    @property
    def name(self):
        """The name given at construction time (read-only)."""
        return self._name

    def script(self):
        """Return the JavaScript for this element; the base class returns an empty string."""
        return ""
class WebEnginePage(QtWebEngineWidgets.QWebEnginePage):
    """Web page that registers its objects on a QWebChannel after each successful load."""

    def __init__(self, parent=None):
        super().__init__(parent)
        self._objects = []
        self._scripts = []
        self.loadFinished.connect(self.onLoadFinished)

    def add_object(self, obj):
        """Queue *obj* for registration on the web channel."""
        self._objects.append(obj)

    @QtCore.Slot(bool)
    def onLoadFinished(self, ok):
        """On a successful load, inject qwebchannel.js and then register the objects."""
        print("Finished loading: ", ok)
        if not ok:
            return
        self.load_qwebchannel()
        self.add_objects()

    def load_qwebchannel(self):
        """Run the bundled qwebchannel.js in the page and make sure a channel is set."""
        resource = QtCore.QFile(":/qtwebchannel/qwebchannel.js")
        if resource.open(QtCore.QIODevice.ReadOnly):
            raw = resource.readAll()
            resource.close()
            self.runJavaScript(raw.data().decode())
        if self.webChannel() is None:
            self.setWebChannel(QtWebChannel.QWebChannel(self))

    def add_objects(self):
        """Register the queued objects and run the scripts of the Element instances."""
        channel = self.webChannel()
        if channel is None:
            return

        registry = {}
        for item in self._objects:
            registry[item.name] = item
        channel.registerObjects(registry)

        # Declares one JS variable per registered name and binds it once the
        # channel is ready on the page side.
        bootstrap = """
{% for obj in objects %}
var {{obj}};
{% endfor %}
new QWebChannel(qt.webChannelTransport, function (channel) {
{% for obj in objects %}
{{obj}} = channel.objects.{{obj}};
{% endfor %}
});
"""
        self.runJavaScript(Template(bootstrap).render(objects=registry.keys()))

        for element in (o for o in self._objects if isinstance(o, Element)):
            self.runJavaScript(element.script())
class Helper(Element):
    """Element whose script forwards the tagName of every clicked node to Python."""

    # Emitted with the non-empty string received by objectClicked().
    classClicked = QtCore.Signal(str)

    def script(self):
        """Return a click listener that calls objectClicked() with the target's tagName."""
        listener = """
document.addEventListener('click', function(e) {
e = e || window.event;
var target = e.target || e.srcElement;
e.preventDefault()
{{name}}.objectClicked(target.tagName);
}, false);"""
        rendered = Template(listener).render(name=self.name)
        return rendered

    @QtCore.Slot(str)
    def objectClicked(self, className):
        """Re-emit *className* through classClicked; empty strings are dropped.

        Despite the parameter name, the injected script passes
        ``target.tagName``.
        """
        if not className:
            return
        self.classClicked.emit(className)
# Script entry point: create the application and the main window, then hand
# control to the Qt event loop and exit with its return code.
if __name__ == "__main__":
    import sys

    app = QtWidgets.QApplication(sys.argv)
    window = Form()
    window.show()
    exit_code = app.exec_()
    sys.exit(exit_code)
The .JS File : (to get the classNames/tagNames when clicked)
// Namespace objects that the DOMPath helpers below are attached to.
Elements = { DOMPath: {} };
/**
 * Builds an XPath expression for `node` by collecting one step per ancestor.
 *
 * Collection stops at the first node for which no step can be computed, or
 * at the first step flagged as optimized. A leading '/' is added unless the
 * outermost collected step is optimized.
 *
 * @param {!Node} node
 * @param {boolean=} optimized
 * @return {string}
 */
Elements.DOMPath.xPath = function (node, optimized) {
  if (node.nodeType === Node.DOCUMENT_NODE) {
    return '/';
  }

  // Walk upwards, inserting each step at the front so the outermost ancestor
  // ends up first.
  const steps = [];
  for (let current = node; current; current = current.parentNode) {
    const step = Elements.DOMPath._xPathValue(current, optimized);
    if (!step) {
      break; // Error - bail out early.
    }
    steps.unshift(step);
    if (step.optimized) {
      break;
    }
  }

  const startsOptimized = steps.length && steps[0].optimized;
  const prefix = startsOptimized ? '' : '/';
  return prefix + steps.join('/');
};
/**
 * Computes the XPath step for a single node, or null when its index among
 * its siblings cannot be determined.
 *
 * @param {!Node} node
 * @param {boolean=} optimized
 * @return {?Elements.DOMPath.Step}
 */
Elements.DOMPath._xPathValue = function (node, optimized) {
  const ownIndex = Elements.DOMPath._xPathIndex(node);
  if (ownIndex === -1) {
    return null; // Error.
  }

  const Step = Elements.DOMPath.Step;
  let ownValue;
  switch (node.nodeType) {
    case Node.ELEMENT_NODE: {
      // In optimized mode an element with a non-empty id is addressed
      // directly, and the step is flagged as optimized.
      const id = optimized ? node.getAttribute('id') : null;
      if (id) {
        return new Step('//*[@id="' + id + '"]', true);
      }
      ownValue = node.localName;
      break;
    }
    case Node.ATTRIBUTE_NODE:
      ownValue = '@' + node.nodeName;
      break;
    case Node.TEXT_NODE:
    case Node.CDATA_SECTION_NODE:
      ownValue = 'text()';
      break;
    case Node.PROCESSING_INSTRUCTION_NODE:
      ownValue = 'processing-instruction()';
      break;
    case Node.COMMENT_NODE:
      ownValue = 'comment()';
      break;
    default:
      // Document nodes and every other node type yield an empty step.
      ownValue = '';
      break;
  }

  const stepValue = ownIndex > 0 ? ownValue + '[' + ownIndex + ']' : ownValue;
  return new Step(stepValue, node.nodeType === Node.DOCUMENT_NODE);
};
/**
 * Returns the 1-based XPath index of `node` among the siblings that match the
 * same step expression.
 *
 * Returns 0 when the node has no parent or no similar sibling (no index is
 * needed), and -1 when the node cannot be found among its parent's child
 * nodes.
 *
 * @param {!Node} node
 * @return {number}
 */
Elements.DOMPath._xPathIndex = function (node) {
  // Two nodes are "similar" when the same XPath step would select both.
  function areNodesSimilar(left, right) {
    if (left === right) {
      return true;
    }
    if (left.nodeType === Node.ELEMENT_NODE && right.nodeType === Node.ELEMENT_NODE) {
      return left.localName === right.localName;
    }
    if (left.nodeType === right.nodeType) {
      return true;
    }
    // XPath treats CDATA as text nodes.
    const leftType = left.nodeType === Node.CDATA_SECTION_NODE ? Node.TEXT_NODE : left.nodeType;
    const rightType = right.nodeType === Node.CDATA_SECTION_NODE ? Node.TEXT_NODE : right.nodeType;
    return leftType === rightType;
  }

  // Use childNodes rather than children: `children` only lists elements, so
  // text, comment and processing-instruction nodes were never compared with
  // their own siblings and always came back without an index. Element
  // indices are unaffected because an element is only similar to elements
  // with the same localName.
  const siblings = node.parentNode ? node.parentNode.childNodes : null;
  if (!siblings) {
    return 0; // Root node - no siblings.
  }

  let hasSimilarSibling = false;
  for (let i = 0; i < siblings.length; ++i) {
    if (siblings[i] !== node && areNodesSimilar(node, siblings[i])) {
      hasSimilarSibling = true;
      break;
    }
  }
  if (!hasSimilarSibling) {
    return 0;
  }

  let ownIndex = 1; // XPath indices start with 1.
  for (let i = 0; i < siblings.length; ++i) {
    if (areNodesSimilar(node, siblings[i])) {
      if (siblings[i] === node) {
        return ownIndex;
      }
      ++ownIndex;
    }
  }
  return -1; // An error occurred: |node| not found in parent's child nodes.
};
/**
 * One step of an XPath expression together with an "optimized" flag.
 *
 * @unrestricted
 */
Elements.DOMPath.Step = class {
  /**
   * @param {string} value
   * @param {boolean} optimized
   */
  constructor(value, optimized) {
    this.value = value;
    // Any falsy argument (including a missing one) is stored as false.
    this.optimized = optimized ? optimized : false;
  }

  /**
   * Lets Array.prototype.join() turn a list of steps into a path string.
   *
   * @override
   * @return {string}
   */
  toString() {
    return this.value;
  }
};
Also, please explain how you solved this.
Upvotes: 1
Views: 228
Reputation: 243897
The logic is similar to my other answers that I have provided, but you may not understand it because you probably have not analyzed it in detail.
The logic is:
Create a class that inherits from Element and override its script method to implement the logic of the program (in this case: load the script xpath_from_element.js, implement the algorithm that obtains the clicked element, obtain the necessary attributes and send them to a slot).
Create a signal with the necessary number of arguments (in this case the xpath and tagName)
Create a slot where the information is received and emit the signal.
With the above I hope you understand the simple logic of my implementation.
import os
from PySide2 import QtCore, QtGui, QtWidgets, QtWebEngineWidgets, QtWebChannel
from jinja2 import Template
# Directory containing this file (symlinks resolved); used to locate
# xpath_from_element.js.
CURRENT_DIR = os.path.split(os.path.realpath(__file__))[0]
class Element(QtCore.QObject):
    """Named QObject that can supply a JavaScript snippet through script()."""

    def __init__(self, name, parent=None):
        super().__init__(parent)
        self._name = name

    @property
    def name(self):
        """The name given at construction time (read-only)."""
        return self._name

    def script(self):
        """Return the JavaScript for this element; the base class returns an empty string."""
        return ""
class WebEnginePage(QtWebEngineWidgets.QWebEnginePage):
    """Web page that registers its objects on a QWebChannel after each successful load."""

    def __init__(self, parent=None):
        super().__init__(parent)
        self._objects = []
        self._scripts = []
        self.loadFinished.connect(self.onLoadFinished)

    def add_object(self, obj):
        """Queue *obj* for registration on the web channel."""
        self._objects.append(obj)

    @QtCore.Slot(bool)
    def onLoadFinished(self, ok):
        """On a successful load, inject qwebchannel.js and then register the objects."""
        print("Finished loading: ", ok)
        if not ok:
            return
        self.load_qwebchannel()
        self.add_objects()

    def load_qwebchannel(self):
        """Run the bundled qwebchannel.js in the page and make sure a channel is set."""
        resource = QtCore.QFile(":/qtwebchannel/qwebchannel.js")
        if resource.open(QtCore.QIODevice.ReadOnly):
            raw = resource.readAll()
            resource.close()
            self.runJavaScript(raw.data().decode())
        if self.webChannel() is None:
            self.setWebChannel(QtWebChannel.QWebChannel(self))

    def add_objects(self):
        """Register the queued objects and run the scripts of the Element instances."""
        channel = self.webChannel()
        if channel is None:
            return

        registry = {}
        for item in self._objects:
            registry[item.name] = item
        channel.registerObjects(registry)

        # Declares one JS variable per registered name and binds it once the
        # channel is ready on the page side.
        bootstrap = """
{% for obj in objects %}
var {{obj}};
{% endfor %}
new QWebChannel(qt.webChannelTransport, function (channel) {
{% for obj in objects %}
{{obj}} = channel.objects.{{obj}};
{% endfor %}
});
"""
        self.runJavaScript(Template(bootstrap).render(objects=registry.keys()))

        for element in (o for o in self._objects if isinstance(o, Element)):
            self.runJavaScript(element.script())
class Helper(Element):
    """Element that reports the XPath and tagName of every clicked node."""

    # Emitted as (xpath, tagName) for each click forwarded by the page.
    elementClicked = QtCore.Signal(str, str)

    def script(self):
        """Return xpath_from_element.js followed by the click listener.

        Only the listener snippet is rendered with Jinja2. The library file is
        appended verbatim, because rendering it too would misinterpret any
        ``{{``, ``{%`` or ``{#`` sequence that happens to occur in the
        JavaScript source.
        """
        library = ""
        path = os.path.join(CURRENT_DIR, "xpath_from_element.js")
        file = QtCore.QFile(path)
        if file.open(QtCore.QIODevice.ReadOnly):
            content = file.readAll()
            file.close()
            library = content.data().decode()
        else:
            # Without the library the listener's call to
            # Elements.DOMPath.xPath fails in the page and nothing reaches
            # Python, so make the cause visible.
            print("Could not read", path)

        listener = """
document.addEventListener('click', function(e) {
e = e || window.event;
var target = e.target || e.srcElement;
var xpath = Elements.DOMPath.xPath(target, false);
var tagName = target.tagName;
{{name}}.received_data(xpath, tagName);
}, false);"""
        return library + Template(listener).render(name=self.name)

    @QtCore.Slot(str, str)
    def received_data(self, xpath, tagName):
        """Slot called from the page; re-emits both values through elementClicked."""
        self.elementClicked.emit(xpath, tagName)
class Form(QtWidgets.QWidget):
    """Main window: Execute/Load buttons, URL field, combo box and a web view.

    A single Helper named ``classname_helper`` is added to the page; its
    elementClicked signal is connected to :meth:`on_clicked`.
    """

    def __init__(self, parent=None):
        super(Form, self).__init__(parent)
        self.setWindowTitle("test")
        self.setMinimumSize(320, 240)
        self.resize(640, 480)

        self.execute_btn = QtWidgets.QPushButton(self.tr("Execute"))
        self.load_btn = QtWidgets.QPushButton(self.tr("Load"))
        self.url_le = QtWidgets.QLineEdit()
        self.cb = QtWidgets.QComboBox()
        self.view = QtWebEngineWidgets.QWebEngineView()
        self.page = WebEnginePage(self)
        self.view.setPage(self.page)

        classname_helper = Helper("classname_helper")
        classname_helper.elementClicked.connect(self.on_clicked)
        self.page.add_object(classname_helper)

        gridlayout = QtWidgets.QGridLayout(self)
        gridlayout.addWidget(self.execute_btn, 0, 0)
        gridlayout.addWidget(self.url_le, 0, 1)
        gridlayout.addWidget(self.load_btn, 0, 2)
        gridlayout.addWidget(self.cb, 0, 3)
        # A column span of -1 is the documented way to extend a widget to the
        # right edge of the grid; the previous value 0 is not a documented span.
        gridlayout.addWidget(self.view, 1, 0, 4, -1)

        self.cb.addItems(["1", "2", "3", "4", "5"])
        # The change signal is connected only below, so seed the value here;
        # otherwise the attribute does not exist until the first change.
        self.combo_value = self.cb.currentText()

        self.execute_btn.clicked.connect(self.run_myscript)
        self.load_btn.clicked.connect(self.load_the_url)
        self.cb.currentIndexChanged.connect(self.selectionchange)

    def run_myscript(self):
        """Handler of the Execute button; currently does nothing."""
        pass

    def selectionchange(self):
        """Remember and print the combo box's current text."""
        print("Items in the list are :")
        self.combo_value = self.cb.currentText()
        print(self.combo_value)

    def load_the_url(self):
        """Strip a fixed set of fragments from the URL field's text and load the result."""
        text = self.url_le.text()
        for letter in ("(", ")", "'", ",", "True", " "):
            text = text.replace(letter, "")
        self.view.load(QtCore.QUrl(text))

    def on_clicked(self, xpath, tagName):
        """Print the XPath and tag name reported for a clicked element."""
        print("on_clicked:", xpath, tagName)
# Script entry point: create the application and the main window, then hand
# control to the Qt event loop and exit with its return code.
if __name__ == "__main__":
    import sys

    app = QtWidgets.QApplication(sys.argv)
    window = Form()
    window.show()
    exit_code = app.exec_()
    sys.exit(exit_code)
Upvotes: 1