diff options
Diffstat (limited to '')
-rw-r--r-- | share/extensions/other/clipart/sources/ocal.py | 97 |
1 files changed, 97 insertions, 0 deletions
diff --git a/share/extensions/other/clipart/sources/ocal.py b/share/extensions/other/clipart/sources/ocal.py new file mode 100644 index 0000000..2fe06db --- /dev/null +++ b/share/extensions/other/clipart/sources/ocal.py @@ -0,0 +1,97 @@ +# +# Copyright 2021 Martin Owens <doctormo@gmail.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/> +# + +import sys +import json +import logging + +from import_sources import RemoteSource, RemoteFile +from urllib.parse import urljoin, parse_qs + +try: + from bs4 import BeautifulSoup +except ImportError: + BeautifulSoup = None + + +class OpenClipartFile(RemoteFile): + def get_file(self): + """Extract search result from html""" + response = self.remote.session.get(self.info["file"]) + soup = BeautifulSoup(response.text, features="lxml") + for script in soup.find_all("script"): + content = script.contents + if content and "image" in content[0]: + try: + data = json.loads(content[0]) + return self.remote.to_local_file(data["image"]["url"]) + except Exception: + continue + logging.error("Couldn't load svg from %s", self.info["file"]) + + +class OpenClipart(RemoteSource): + name = "Open Clipart Library" + icon = "sources/ocal.svg" + base_url = "https://openclipart.org/search/" + is_enabled = BeautifulSoup is not None + file_cls = OpenClipartFile + + def html_search(self, response): + """Extract search results from html""" + soup = BeautifulSoup(response.text, features="lxml") + for div in soup.find_all("div", {"class": "artwork"}): + if div.a and div.a.img: + link = urljoin(self.base_url, div.a.get("href")) + img = urljoin(self.base_url, div.a.img.get("src")) + + yield { + "file": link, # Not the actual file yet (see above) + "name": div.a.img.get("alt"), + "thumbnail": img, + "author": "OpenClipart", + "license": "cc-0", + } + + for page in soup.find_all("a", {"class": "page-link", "aria-label": "Next"}): + if "=" in page.get("href", ""): + yield lambda: self._search(**parse_qs(page.get("href").split("?")[-1])) + + def search(self, query): + """HTML searching for now""" + return self._search(query=query) + + def _search(self, **params): + try: + response = self.session.get(self.base_url, params=params) + except Exception: + return [] + + items = [] + next_page = None + for item in self.html_search(response): + if callable(item): + next_page = item + else: + items.append(item) + # Often ocal will have empty pages, weirdly. + if not items and next_page: + return next_page() + # None empty page, return all + if next_page: + items.append(next_page) + return items |