Browse Source

Switched parsing and fetching backend from BeautifulSoup4 and httplib2 to PyQt.

experimental
Bogdan Cordier 7 years ago
parent
commit
e4d74d4454
3 changed files with 166 additions and 79 deletions
  1. 1
    0
      TODO
  2. 163
    77
      kcnrtl/kcnrtl.py
  3. 2
    2
      setup.py

+ 1
- 0
TODO View File

@@ -1,6 +1,7 @@
1 1
 TODO
2 2
 
3 3
 * Show error when word isn't found
4
+* Switch to Qt backend to remove httplib2 and BeautifulSoup4 dependencies
4 5
 * Allow to choose word class for antonyms and synonyms
5 6
 * Add some proximity indicator for synonyms and antonyms as on the CNRTL
6 7
 * Ajouter un mode de priorité pour les requête en fonction de l'onglet

+ 163
- 77
kcnrtl/kcnrtl.py View File

@@ -25,6 +25,8 @@ import httplib2
25 25
 from bs4 import BeautifulSoup
26 26
 from PyQt4.QtCore import *
27 27
 from PyQt4.QtGui import *
28
+from PyQt4.QtNetwork import *
29
+from PyQt4.QtWebKit import QWebPage
28 30
 from gui.Ui_kcnrtl import Ui_MainWindow
29 31
 import re
30 32
 import shutil
@@ -52,6 +54,8 @@ class Main(QMainWindow):
52 54
         self.ui.comboBox.setCurrentIndex(0)
53 55
 
54 56
         self.ui.checkBox.setChecked(False)
57
+
58
+        self.manager = QNetworkAccessManager()
55 59
                 
56 60
         self.dictionaries = [u"TLFi",
57 61
                              u"Académie 9e Ed.",
@@ -73,22 +77,36 @@ class Main(QMainWindow):
73 77
 
74 78
         self.clipboard.dataChanged.connect(self.autoGetFromClipboard)
75 79
 
80
+        self.manager.finished.connect(self.replyFinished)
81
+
82
+        self.loop = QEventLoop()
83
+
84
+        self.manager.finished.connect(self.loop.quit)
85
+
86
+        self.tagform = []
87
+
76 88
     def updateUi(self):
77
-        try:
78
-            # Check if input text is a word
79
-            if len(unicode(self.ui.lineEdit.text()).split()) <= 1:
80
-                self.typed = unicode(self.ui.lineEdit.text())
81
-                self.getLexi(self.typed)
82
-                self.ui.comboBox.clear()
83
-                self.ui.comboBox.addItems(self.lexiForm())
84
-                self.lexiContent()
85
-                self.ui.listView.setModel(self.getSynoAnto("synonymie"))
86
-                self.ui.listView_2.setModel(self.getSynoAnto("antonymie"))
87
-            else:
88
-                self.ui.lineEdit.setText("Veuillez entrer UN mot")
89
-
90
-        except:
91
-            self.ui.lineEdit.setText("Veuillez entrer un mot")
89
+        # Check if input text is a word
90
+        if len(unicode(self.ui.lineEdit.text()).split()) <= 1:
91
+            self.formtype = "definition"
92
+            self.fetch("Lexi")
93
+            self.ui.comboBox.clear()
94
+            self.ui.comboBox.addItems(self.tagform)
95
+            self.formtype = "synonyme"
96
+            self.fetch("Syno")
97
+            self.formtype = "antonyme"
98
+            self.fetch("Anto")
99
+#            self.typed = unicode(self.ui.lineEdit.text())
100
+#            self.getLexi(self.typed)
101
+#            self.ui.comboBox.clear()
102
+#            self.ui.comboBox.addItems(self.lexiForm())
103
+#            self.lexiContent()
104
+#            self.ui.listView.setModel(self.getSynoAnto("synonymie"))
105
+#            self.ui.listView_2.setModel(self.getSynoAnto("antonymie"))
106
+        else:
107
+            self.ui.lineEdit.setText("Veuillez entrer UN mot")
108
+
109
+
92 110
             
93 111
 #    # TODO: Dynamically adjust dictionaries name to windows size
94 112
 #    def resizeEvent(self, event):
@@ -103,81 +121,149 @@ class Main(QMainWindow):
103 121
     def onRowClicked(self, qmodelindex):
104 122
         item = qmodelindex.data(Qt.DisplayRole).toString()
105 123
         self.clipboard.setText(item)
106
-
124
+#
107 125
     def autoGetFromClipboard(self):
108 126
         if self.ui.checkBox.isChecked():
109 127
             self.ui.lineEdit.setText(unicode(self.clipboard.text()))
110 128
             self.updateUi()
111
-
129
+#
112 130
     def onComboChange(self):
113
-        self.getLexi(self.typed)
114
-        self.lexiContent()
131
+        self.formtype = "definition"
132
+        self.fetch("Lexi")
115 133
     
116
-    def getSynoAnto(self, form):
117
-        tag = []
118
-        soup = BeautifulSoup(self.getHtml(self.typed, form))
119
-        tagy = soup.find_all('td', "%s_format" % (form[:4]))
120
-        i = 0
121
-        while i < len(tagy):
122
-            tag_a = tagy[i]
123
-            tag.append(tag_a.text)
124
-            i += 1
125
-        model = ListModel(tag, self)
126
-        return model
127
-     
128
-    def getLexi(self, text):
129
-        h = self.getHtml(text, "definition")
130
-        global soup
131
-        soup = BeautifulSoup(h, "lxml")
132
-        return soup
133
-
134
-    def lexiContent(self):
135
-        tagkeep = soup.find_all('div', {'id': 'contentbox'})
136
-        if not self.ui.comboBox_2.currentIndex():
137
-            tagrm = soup.find_all('div', {'class': 'tlf_cvedette'})
138
-        if 1 <= self.ui.comboBox_2.currentIndex() <= 3:
139
-            tagrm = soup.find_all('span', {'class': 'tlf_cvedette'})
140
-        tag = str(tagkeep[0]).replace(str(tagrm[0]),'')
141
-        self.ui.webView.setHtml(tag.decode('utf8'))
142
-        return tag
143
-
144
-    # Check if there is more than one definition
145
-    def lexiForm(self):
146
-        a = re.compile("return sendRequest\(5,'/definition/.*")
147
-        multdef = soup.find_all('a', {'onclick': a})
148
-        tagform = []
149
-        i = 0
150
-        while i < len(multdef):
151
-            multdef_a = multdef[i]
152
-            # Delete digits in definition title
153
-            multdef_clean = ''.join(c for c in
154
-                                    multdef_a.text if not c.isdigit())
155
-            tagform.append(multdef_clean)
156
-            i += 1
157
-        return tagform
158
-
159
-    def getHtml(self, text, form):
160
-        conn = httplib2.Http('.kcnrtl_cache')
161
-        numdef = self.ui.comboBox.currentIndex()
162
-        if form == "definition":
134
+#    def getSynoAnto(self, form):
135
+#        tag = []
136
+#        soup = BeautifulSoup(self.getHtml(self.typed, form), "lxml")
137
+#        tagy = soup.find_all('td', "%s_format" % (form[:4]))
138
+#        i = 0
139
+#        while i < len(tagy):
140
+#            tag_a = tagy[i]
141
+#            tag.append(tag_a.text)
142
+#            i += 1
143
+#        model = ListModel(tag, self)
144
+#        return model
145
+#
146
+#    def getLexi(self, text):
147
+#        h = self.getHtml(text, "definition")
148
+#        global soup
149
+#        soup = BeautifulSoup(h, "lxml")
150
+#
151
+#    def lexiContent(self):
152
+#        tagkeep = soup.find_all('div', {'id': 'contentbox'})
153
+#        if not self.ui.comboBox_2.currentIndex():
154
+#            tagrm = soup.find_all('div', {'class': 'tlf_cvedette'})
155
+#        if 1 <= self.ui.comboBox_2.currentIndex() <= 3:
156
+#            tagrm = soup.find_all('span', {'class': 'tlf_cvedette'})
157
+#        tag = str(tagkeep[0]).replace(str(tagrm[0]),'')
158
+#        self.ui.webView.setHtml(tag.decode('utf8'))
159
+#        return tag
160
+#
161
+#    # Check if there is more than one definition
162
+#    def lexiForm(self):
163
+#        a = re.compile("return sendRequest\(5,'/definition/.*")
164
+#        multdef = soup.find_all('a', {'onclick': a})
165
+#        tagform = []
166
+#        i = 0
167
+#        while i < len(multdef):
168
+#            multdef_a = multdef[i]
169
+#            # Delete digits in definition title
170
+#            multdef_clean = ''.join(c for c in
171
+#                                    multdef_a.text if not c.isdigit())
172
+#            tagform.append(multdef_clean)
173
+#            i += 1
174
+#        return tagform
175
+#
176
+#    def getHtml(self, text, form):
177
+#        conn = httplib2.Http('.kcnrtl_cache')
178
+#        numdef = self.ui.comboBox.currentIndex()
179
+#        if form == "definition":
180
+#            if not self.ui.comboBox_2.currentIndex():
181
+#                htmlSource = conn.request("http://www.cnrtl.fr/%s/%s//%s" %
182
+#                                          (form, text, numdef), "GET")
183
+#            if self.ui.comboBox_2.currentIndex() > 0:
184
+#                acad = unicode(self.ui.comboBox_2.currentText())
185
+#                acadnum = filter(lambda x: x.isdigit(), acad)
186
+#                acadnumf = "academie" + str(acadnum)
187
+#                htmlSource = conn.request("http://www.cnrtl.fr/%s/%s/%s//%s" %
188
+#                                          (form, acadnumf, text, numdef), "GET")
189
+#
190
+#        else:
191
+#            htmlSource = conn.request("http://www.cnrtl.fr/%s/%s" %
192
+#                                      (form, text), "GET")
193
+#        return htmlSource[1]
194
+
195
+
196
+    def fetch(self, dico):
197
+        if dico == "Lexi":
163 198
             if not self.ui.comboBox_2.currentIndex():
164
-                htmlSource = conn.request("http://www.cnrtl.fr/%s/%s//%s" %
165
-                                        (form, text, numdef), "GET")
199
+                url = ("http://www.cnrtl.fr/definition/%s//%s" %
200
+                       (self.ui.lineEdit.text(), self.ui.comboBox.currentIndex()))
166 201
             if self.ui.comboBox_2.currentIndex() > 0:
167 202
                 acad = unicode(self.ui.comboBox_2.currentText())
168 203
                 acadnum = filter(lambda x: x.isdigit(), acad)
169 204
                 acadnumf = "academie" + str(acadnum)
170
-                htmlSource = conn.request("http://www.cnrtl.fr/%s/%s/%s//%s" %
171
-                                          (form, acadnumf, text, numdef), "GET")
205
+                url = ("http://www.cnrtl.fr/definition/%s/%s//%s" %
206
+                       (acadnumf, self.ui.lineEdit.text(), self.ui.comboBox.currentIndex()))
207
+        if dico == "Syno":
208
+            url = ("http://www.cnrtl.fr/synonymie/%s" %
209
+                   (self.ui.lineEdit.text()))
210
+        if dico == "Anto":
211
+            url = ("http://www.cnrtl.fr/antonymie/%s"  %
212
+                   (self.ui.lineEdit.text()))
213
+        self.manager.get(QNetworkRequest(QUrl(url)))
214
+        self.loop.exec_()
172 215
 
173
-        else:
174
-            htmlSource = conn.request("http://www.cnrtl.fr/%s/%s" %
175
-                                        (form, text), "GET")
176
-        return htmlSource[1]
177
-        
216
+    def replyFinished(self, reply):
217
+        data = reply.readAll()
218
+        #reply.deleteLater()
219
+        page = QWebPage()
220
+        page.mainFrame().setContent(data)
221
+        webpage = page.mainFrame().documentElement()
222
+        if self.formtype == "definition":
223
+            result = webpage.findAll("div#contentbox")
224
+            if not self.ui.comboBox_2.currentIndex():
225
+                result_to_remove = webpage.findAll("div.tlf_cvedette")
226
+            if 1 <= self.ui.comboBox_2.currentIndex() <= 3:
227
+                result_to_remove = webpage.findAll("span.tlf_cvedette")
228
+            string_to_remove = result_to_remove.first().toInnerXml()
229
+            final_page = result.first().toInnerXml()
230
+            resultf = final_page.replace(string_to_remove, '')
231
+            self.ui.webView.setHtml(resultf)
232
+
233
+            result_box = webpage.findFirst('div#vtoolbar')
234
+            result_test = result_box.findAll("a[href]")
235
+            self.tagform = []
236
+            i = 0
237
+            while i < len(result_test):
238
+                multdef_a = unicode(result_test.at(i).toPlainText())
239
+                # Delete digits in definition title
240
+                multdef_clean = ''.join(c for c in
241
+                    multdef_a if not c.isdigit())
242
+                self.tagform.append(multdef_clean)
243
+                i += 1
244
+        if self.formtype == "synonyme":
245
+            result = webpage.findAll("td.syno_format")
246
+            tag = []
247
+            i = 0
248
+            while i < len(result):
249
+                tag.append(result.at(i).firstChild().toPlainText())
250
+                i += 1
251
+            model = ListModel(tag, self)
252
+            self.ui.listView.setModel(model)
253
+            print "3"
254
+        if self.formtype == "antonyme":
255
+            result = webpage.findAll("td.anto_format")
256
+            tag = []
257
+            i = 0
258
+            while i < len(result):
259
+                tag.append(result.at(i).firstChild().toPlainText())
260
+                i += 1
261
+            model2 = ListModel(tag, self)
262
+            self.ui.listView_2.setModel(model2)
263
+            print "4"
178 264
     # Delete cache directory on close
179
-    def closeEvent(self, event):
180
-        shutil.rmtree('.kcnrtl_cache')
265
+#    def closeEvent(self, event):
266
+#        shutil.rmtree('.kcnrtl_cache')
181 267
         
182 268
 
183 269
 class ListModel(QAbstractListModel):

+ 2
- 2
setup.py View File

@@ -8,14 +8,14 @@ def read(fname):
8 8
 
9 9
 setup(name='KCnrtl',
10 10
       version='0.2b',
11
-      description='KDE graphical client for the CNRTL linguistic resources',
11
+      description='Qt graphical client for the CNRTL french linguistic resources',
12 12
       license = "GPLv3", 
13 13
       author="Bogdan Cordier",
14 14
       author_email="bcord@hadaly.fr",
15 15
       url="http://code.lm7.fr/p/kcnrtl/",
16 16
       download_url="http://code.lm7.fr/p/kcnrtl/downloads/", 
17 17
       packages=['kcnrtl', 'kcnrtl.gui',  'kcnrtl.resources'],
18
-      requires=['httplib2', 'beautifulsoup4'],
18
+      requires=['httplib2', 'beautifulsoup4', 'lxml'],
19 19
       long_description=read('README'),
20 20
       classifiers=[
21 21
         "Development Status :: 4 - Beta",

Loading…
Cancel
Save