Browse Source

Refactor Search

- Search engine specifications (search URL, parameters, parsing paths) are now included in a JSON file.
It should be easier to add or modify search engines without touching the
code.
- Fixed type error in search.py.
Bogdan Cordier 3 years ago
parent
commit
6b2dd84eaa
3 changed files with 120 additions and 59 deletions
  1. 56
    53
      hadaly/app.py
  2. 58
    0
      hadaly/data/search_engines.json
  3. 6
    6
      hadaly/search.py

+ 56
- 53
hadaly/app.py View File

@@ -19,6 +19,10 @@ from PIL import Image
19 19
 import tarfile
20 20
 
21 21
 from kivy.config import Config
22
+
23
+# Config.set('graphics', 'fullscreen', 'auto')
24
+Config.set('kivy', 'log_level', 'debug')
25
+
22 26
 from kivy.app import App
23 27
 from kivy.core.window import Window
24 28
 from kivy.properties import StringProperty, ListProperty, DictProperty, NumericProperty
@@ -82,7 +86,8 @@ class HadalyApp(App):
82 86
 
83 87
     def on_start(self):
84 88
         self.tempdir = tempfile.mkdtemp()
85
-
89
+        self.presentation['title'] = _('New Title')
90
+        self.engines = json.load(open('hadaly/data/search_engines.json'))
86 91
         try:
87 92
             if argv[1].endswith('.opah'):
88 93
                 self.load_slides(os.path.dirname(argv[1]), [os.path.basename(argv[1])])
@@ -417,80 +422,78 @@ class HadalyApp(App):
417 422
 
418 423
     def search_term(self, term, engine, page):
419 424
 
420
-        engines = {
421
-            'met': ('http://www.metmuseum.org/'
422
-                    'collection/the-collection-online/search'
423
-                    '?ft={term}&ao=on&rpp={rpp}&pg={page}'),
424
-            'getty': ('http://search.getty.edu/'
425
-                      'gateway/search'
426
-                      '?q={term}&cat=highlight&f="Open+Content+Images"&rows={rpp}&srt=&dir=s&dsp=0&img=0&pg={page}')
427
-        }
425
+        params = urllib.parse.urlencode({self.engines[engine]['params']['term']: term,
426
+                            self.engines[engine]['params']['rpp']: self.config.get('search', 'search_rpp'),
427
+                            self.engines[engine]['params']['page']: page})
428 428
 
429
-        url = engines[engine].format(term=term,
430
-                                     rpp=self.config.get('search', 'search_rpp'),
431
-                                     page=page)
432
-        clean_url = urllib.parse.quote(url, safe="%/:=&?~#+!$,;'@()*[]")
429
+        url = ''.join((self.engines[engine]['base_url'], params))
433 430
 
434
-        UrlRequest(clean_url, on_success=self.parse_results, debug=True)
431
+        UrlRequest(url, on_success=self.parse_results, debug=True)
435 432
 
436 433
     def parse_results(self, request, data):
437
-        tree = html.fromstring(data)
438 434
         search_screen = self.root.get_screen('search')
439 435
         results = []
440 436
 
441
-        if 'metmuseum' in urllib.parse.urlparse(request.url).hostname:
442
-
437
+        if self.engines[urllib.parse.urlparse(request.url).hostname]['results']['format'] == 'html':
438
+            tree = html.fromstring(data)
443 439
             try:
444
-                total_pages = tree.xpath(
445
-                    '//li[starts-with(@id, "phcontent_0_phfullwidthcontent_0_paginationWidget_rptPagination_paginationLineItem_")]//a/text()')
446
-                search_screen.box.total_pages = max([int(n) for n in total_pages])
440
+                total_results = re.sub("[^0-9]", "", tree.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['total_results'])[0])
441
+                if not total_results or int(total_results) == 0:
442
+                    raise ValueError
447 443
             except ValueError:
448 444
                 self.show_popup(_('Error'), _('No results found.'))
449 445
                 return
450 446
 
451
-            objects = tree.xpath('//div[starts-with(@class, "list-view-object ")]')
452
-
453
-            for object in objects:
454
-                result = {}
455
-                result['title'] = object.xpath('./div[@class="list-view-object-info"]/a/div[@class="objtitle"]/text()')[
456
-                    0]
457
-                try:
458
-                    result['artist'] = \
459
-                        object.xpath('./div[@class="list-view-object-info"]/div[@class="artist"]/text()')[::2][0]
460
-                except IndexError:
461
-                    result['artist'] = 'Unknown'
462
-                result['thumb'] = urllib.parse.quote(object.xpath('./div[@class="list-view-thumbnail"]//img/@src')[0],
463
-                                               safe="%/:=&?~#+!$,;'@()*[]")
464
-                object_info = object.xpath('./div[@class="list-view-object-info"]/div[@class="objectinfo"]/text()')
465
-                result['year'] = object_info[0].strip('Dates: ')
466
-                results.append(result)
467
-
468
-        elif 'getty' in urllib.parse.urlparse(request.url).hostname:
447
+            if isinstance(tree.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['total_pages']), list):
448
+                search_screen.box.total_pages = tree.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['total_pages'])[0]
449
+            else:
450
+                search_screen.box.total_pages = tree.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['total_pages'])
451
+
452
+            for entry in tree.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['entries']):
453
+                artist = entry.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['artist'])[0]
454
+                title = entry.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['title'])[0]
455
+                date = entry.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['date'])[0]
456
+                thumb = entry.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['thumb'])[0]
457
+                obj_link = entry.xpath(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['obj_link'])[0]
458
+
459
+                results.append({'title': title,
460
+                                'artist': artist,
461
+                                'year': date,
462
+                                'thumb': urllib.parse.quote(thumb, safe="%/:=&?~#+!$,;'@()*[]"),
463
+                                'obj_link': obj_link}
464
+                               )
465
+
466
+        elif self.engines[urllib.parse.urlparse(request.url).hostname]['results']['format'] == 'json':
467
+            from ast import literal_eval
468
+            from functools import reduce
469 469
 
470 470
             try:
471
-                search_screen.box.total_pages = tree.xpath('//td[@class="cs-page"]//input/@count')[0]
472
-                if int(re.sub("[^0-9]", "", tree.xpath('//strong[@id="cs-results-count"]//text()')[0])) == 0:
471
+                total_results = reduce(dict.__getitem__, literal_eval(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['total_results']), data)
472
+                if int(total_results) == 0:
473 473
                     raise ValueError
474 474
             except ValueError:
475 475
                 self.show_popup(_('Error'), _('No results found.'))
476 476
                 return
477 477
 
478
-            artists = tree.xpath(
479
-                '//div[@class="cs-result-data-brief"]//td[* = "Creator:" or * = "Maker Name:"]/following-sibling::td[1]/p/text()')
480
-            titles = tree.xpath(
481
-                '//div[@class="cs-result-data-brief"]//td[* = "Title:" or * = "Primary Title:"]/following-sibling::td[1]/p[@class="cs-record-link"]/a/strong/text()')
482
-            dates = tree.xpath(
483
-                '//div[@class="cs-result-data-brief"]//td[* = "Date:"]/following-sibling::td[1]/p/text()')
484
-            thumb = tree.xpath('//img[@class="cs-result-thumbnail"]/@src')
485
-            obj_link = tree.xpath(
486
-                '//div[@class="cs-result-data-brief"]//td[* = "Title:" or * = "Primary Title:"]/following-sibling::td[1]/p[@class="cs-record-link"]/a/@href')
478
+            entries = reduce(dict.__getitem__, literal_eval(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['entries']), data)
479
+            search_screen.box.total_pages = int(total_results / len(entries))
480
+
481
+            for entry in entries:
482
+                artist = reduce(dict.__getitem__, literal_eval(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['artist']), entry)
483
+                title = reduce(dict.__getitem__, literal_eval(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['title']), entry)
484
+                date = reduce(dict.__getitem__, literal_eval(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['date']), entry)
485
+                thumb = reduce(dict.__getitem__, literal_eval(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['thumb']), entry)
486
+                obj_link = reduce(dict.__getitem__, literal_eval(self.engines[urllib.parse.urlparse(request.url).hostname]['results']['obj_link']), entry)
487 487
 
488
-            results = [{'title': a, 'artist': b, 'year': c, 'thumb': urllib.parse.quote(d, safe="%/:=&?~#+!$,;'@()*[]"),
489
-                        'obj_link': e}
490
-                       for a, b, c, d, e in zip(titles, artists, dates, thumb, obj_link)]
488
+                results.append({'title': title,
489
+                                'artist': artist,
490
+                                'year': date,
491
+                                'thumb': urllib.parse.quote(thumb, safe="%/:=&?~#+!$,;'@()*[]"),
492
+                                'obj_link': obj_link}
493
+                               )
491 494
 
492 495
         for photo in results:
493
-            Logger.debug('Search (MET): Loading {url}'.format(url=photo['thumb']))
496
+            Logger.debug('Search : Loading {url}'.format(url=photo['thumb']))
494 497
             image = ItemButton(photo=photo, source=photo['thumb'], keep_ratio=True)
495 498
             search_screen.box.grid.add_widget(image)
496 499
 

+ 58
- 0
hadaly/data/search_engines.json View File

@@ -0,0 +1,58 @@
1
+{
2
+  "www.metmuseum.org": {
3
+    "base_url": "http://www.metmuseum.org/api/collection/collectionlisting?showOnly=withImage&",
4
+    "params": {
5
+      "rpp": "perPage",
6
+      "artist": "artist",
7
+      "page": "page",
8
+      "term": "q",
9
+      "sort": "sortBy",
10
+      "sortings": {
11
+        "relevance": "Relevance"
12
+      },
13
+      "order": "sortOrder",
14
+      "orderings": {
15
+        "asc": "asc"
16
+      }
17
+    },
18
+    "results": {
19
+      "format": "json",
20
+      "entries": "('results',)",
21
+      "total_results": "('totalResults',)",
22
+      "title": "('title',)",
23
+      "artist": "('description',)",
24
+      "date": "('date',)",
25
+      "thumb": "('image',)",
26
+      "obj_link": "('largeImage',)"
27
+    }
28
+  },
29
+  "search.getty.edu": {
30
+    "base_url": "http://search.getty.edu/gateway/search?cat=highlight&f=\"Open+Content+Images\"&dir=s&img=1&dsp=0&",
31
+    "params": {
32
+      "rpp": "rows",
33
+      "artist": "artist",
34
+      "page": "pg",
35
+      "term": "q",
36
+      "sort": "sortBy",
37
+      "sortings": {
38
+        "relevance": "Relevance"
39
+      },
40
+      "order": "sortOrder",
41
+      "orderings": {
42
+        "asc": "asc"
43
+      }
44
+    },
45
+    "results": {
46
+      "format": "html",
47
+      "entries": "//div[@class=\"cs-result-item\"]",
48
+      "title": ".//div[@class=\"cs-result-data-brief\"]//td[* = \"Title:\" or * = \"Primary Title:\"]/following-sibling::td[1]/p[@class=\"cs-record-link\"]/a/strong/text()",
49
+      "artist": ".//div[@class=\"cs-result-data-brief\"]//td[* = \"Creator:\" or * = \"Maker Name:\"]/following-sibling::td[1]/p/text()",
50
+      "date": ".//div[@class=\"cs-result-data-brief\"]//td[* = \"Date:\"]/following-sibling::td[1]/p/text()",
51
+      "thumb": ".//img[@class=\"cs-result-thumbnail\"]/@src",
52
+      "obj_link": ".//div[@class=\"cs-result-data-brief\"]//td[* = \"Title:\" or * = \"Primary Title:\"]/following-sibling::td[1]/p[@class=\"cs-record-link\"]/a/@href",
53
+      "total_pages": "//td[@class=\"cs-page\"]//input/@count",
54
+      "total_results": "//strong[@id=\"cs-results-count\"]//text()"
55
+    }
56
+  }
57
+
58
+}

+ 6
- 6
hadaly/search.py View File

@@ -32,8 +32,8 @@ class SearchBox(BoxLayout):
32 32
         Logger.debug('Search: requesting {term} from {provider}'.format(term=term,
33 33
                                                                         provider=provider))
34 34
         self.grid.clear_widgets()
35
-        providers = {'MetMuseum': 'met',
36
-                     'Getty OCI': 'getty'}
35
+        providers = {'MetMuseum': 'www.metmuseum.org',
36
+                     'Getty OCI': 'search.getty.edu'}
37 37
         if not term:
38 38
             popup = Popup(title=_('Error'), size_hint=(0.3, 0.2))
39 39
             popup.add_widget(Label(text=_('Please enter a search term.')))
@@ -58,7 +58,7 @@ class SearchBox(BoxLayout):
58 58
             self.app.search_term(term, providers[provider], self.current_page)
59 59
 
60 60
     def search_next(self, text, provider):
61
-        if self.current_page < self.total_pages:
61
+        if self.current_page < int(self.total_pages):
62 62
             self.current_page += 1
63 63
             self.search(text, provider)
64 64
 
@@ -105,7 +105,7 @@ class SearchItemInfo(Popup):
105 105
 
106 106
         if self.provider == 'MET':
107 107
             # Check if high-res is available.
108
-            url = self.photo['thumb'].replace('web-thumb', 'original')
108
+            url = self.photo['thumb'].replace('mobile-large', 'original')
109 109
             req = urllib.request.urlopen(url)
110 110
             if req.getcode() == 404:
111 111
                 Logger.debug('Search: High-res image not available.')
@@ -120,9 +120,9 @@ class SearchItemInfo(Popup):
120 120
 
121 121
             req = urllib.request.urlopen(self.photo['obj_link'])
122 122
             tree = html.parse(req)
123
-            img_link = tree.xpath('//div[@class="cs-result-image"]//span[@class="nav"]/a/@href')
123
+            img_link = tree.xpath('//a[@id="download-open-content"]/@href')
124 124
             if img_link:
125
-                link = urllib.parse.urlparse(tree.xpath('//div[@class="cs-result-image"]//span[@class="nav"]/a/@href')[0])
125
+                link = urllib.parse.urlparse(img_link[0])
126 126
                 url = urllib.parse.parse_qs(link.query)['dlimgurl'][0]
127 127
             else:
128 128
                 url = self.photo['thumb']

Loading…
Cancel
Save