class SWHarvester(BaseHarvester):
    """Harvester for SEINet (swbiodiversity.org) collection profile pages.

    The page at ``self.kwargs['list_url']`` is scraped for links carrying a
    ``collid=<digits>`` query argument. Each distinct collection id is then
    requested from the same URL with ``collid`` set, stripped of bulky
    markup, and yielded as an ``(identifier, html)`` pair.
    """
    VERSION = 1

    def _do_fetch(self, start, end, **kwargs):
        """Return a generator of ``(identifier, html)`` pairs.

        ``start`` and ``end`` are only used for the log line; every
        collection linked from the list page is fetched regardless of date.
        """
        logger.info('Harvesting swbiodiversity %s - %s', start.date(), end.date())
        return self.fetch_records()

    def fetch_records(self):
        """Yield ``(collection id, str(html))`` for every listed collection.

        Raises whatever ``response.raise_for_status()`` raises when either
        the list page or a collection page returns an error status.
        """
        response = self.requests.get(self.kwargs['list_url'])
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')

        # Collect collection ids in first-seen order. ``href=True`` skips
        # anchors that have no href attribute (those would otherwise hand
        # ``None`` to the regex and raise TypeError).
        seen = set()
        record_list = []
        for anchor in soup.find_all('a', href=True):
            match = re.search(r'collid=(\d+)', anchor['href'])
            if match is None:
                continue
            identifier = match.group(1)
            if identifier not in seen:
                seen.add(identifier)
                record_list.append(identifier)
        total = len(record_list)

        logger.info('Found %d results from swbiodiversity', total)

        # 1-based counter so the progress line ends at "N of N (100%)".
        for count, identifier in enumerate(record_list, start=1):

            logger.info('On collection %d of %d (%d%%)', count, total, (count / total) * 100)

            collection_page = furl(self.kwargs['list_url'])
            collection_page.args['collid'] = identifier
            response = self.requests.get(collection_page.url)
            response.raise_for_status()

            raw_data = BeautifulSoup(response.content, 'html.parser')
            # Peel out script tags and css things to minimize size of HTML
            for el in itertools.chain(
                raw_data('img'),
                raw_data('link', rel=('stylesheet', 'dns-prefetch')),
                raw_data('link', {'type': re.compile('.')}),
                raw_data('noscript'),
                raw_data('script'),
                raw_data(string=lambda x: isinstance(x, Comment)),
            ):
                el.extract()

            record = raw_data.find(id='innertext')
            if record is None:
                # Without the content container there is nothing to store;
                # str(None) would otherwise be emitted as the raw record.
                logger.warning('No "innertext" element for swbiodiversity collection %s; skipping', identifier)
                continue

            yield identifier, str(record)
# Parser declarations: each class maps keys of the dict produced by
# SWTransformer.unwrap_data onto fields through the chain ``tools`` links.

class AgentIdentifier(Parser):
    uri = tools.IRI(ctx)


class WorkIdentifier(Parser):
    uri = tools.IRI(ctx)


class Organization(Parser):
    name = ctx


class Publisher(Parser):
    agent = tools.Delegate(Organization, ctx)


class Institution(Parser):
    name = ctx


class IsAffiliatedWith(Parser):
    related = tools.Delegate(Institution)


class Person(Parser):
    # Name parts are derived from the contact's ``name``; the contact's
    # ``email`` (when present) is delegated to AgentIdentifier.
    given_name = tools.ParseName(tools.Try(ctx.name)).first
    family_name = tools.ParseName(tools.Try(ctx.name)).last
    identifiers = tools.Map(tools.Delegate(AgentIdentifier), tools.Try(ctx.email))


class Creator(Parser):
    agent = tools.Delegate(Person, ctx)


class Dataset(Parser):
    title = tools.Try(ctx['title'])
    description = tools.Try(ctx['description'])

    # Access rights and usage rights are concatenated and joined.
    rights = tools.Try(
        tools.Join(
            tools.Concat(
                tools.Try(ctx['access-rights']),
                tools.Try(ctx['usage-rights'])
            )
        )
    )

    related_agents = tools.Map(tools.Delegate(Creator), tools.Try(ctx.contact))

    class Extra:
        access_rights = tools.Try(ctx['access-rights'])
        usage_rights = tools.Try(ctx['usage-rights'])
        collection_statistics = tools.Try(ctx['collection-statistics'])
        management = tools.Try(ctx['management'])
        collection_type = tools.Try(ctx['collection-type'])
        last_update = tools.ParseDate(tools.Try(ctx['last-update']))


class SWTransformer(SoupXMLTransformer):
    """Turn a harvested swbiodiversity HTML fragment into a Dataset."""
    VERSION = 1
    root_parser = Dataset

    def unwrap_data(self, input_data):
        """Scrape ``input_data`` (HTML text) into a flat dict.

        Keys that may be set: ``title``, ``description``, ``contact``,
        ``collection-type``, ``management``, ``last-update``,
        ``usage-rights``, ``access-rights`` and ``collection-statistics``.
        Only the title is extracted when the nested ``<div><div>``
        container is missing.
        """
        record = BeautifulSoup(input_data, 'lxml').html
        data = {}

        title = self.extract_text(record.h1)
        if title:
            data['title'] = title

        # Resolve the container before touching it: the description and all
        # remaining fields are located relative to this element.
        outer = record.div
        start = outer.div if outer is not None else None
        if start is None:
            return data

        description = self.extract_text(start.find_next())
        if description:
            data['description'] = description

        entries = [
            self.extract_text(element)
            for element in start.find_all_next(style='margin-top:5px;')
        ]

        for entry in entries:

            if 'Contact:' in entry:
                contact_dict = self._parse_contact(entry)
                if contact_dict:
                    data['contact'] = contact_dict

            if 'Collection Type:' in entry:
                data['collection-type'] = entry.replace('Collection Type: ', '')

            if 'Management:' in entry:
                management = entry.replace('Management: ', '')
                if 'Last Update:' in management:
                    # "Last Update:" trails the management text in the same entry.
                    management, last_update = management.split('Last Update:', 1)
                    if last_update:
                        data['last-update'] = last_update.strip()
                data['management'] = management.strip()

            if 'Usage Rights:' in entry:
                data['usage-rights'] = entry.replace('Usage Rights: ', '')

            if 'Access Rights' in entry or 'Rights Holder:' in entry:
                data['access-rights'] = (
                    entry.replace('Access Rights: ', '').replace('Rights Holder: ', '')
                )

        statistics = [self.extract_text(item) for item in start.find_all_next('li')]
        data['collection-statistics'] = self.process_collection_stat(statistics)
        return data

    def _parse_contact(self, entry):
        """Return ``{'name': ..., 'email': ...}`` parsed from a contact entry.

        The expected shape is ``Contact: Name[, Curator] (email)``. Either
        key is omitted when it cannot be extracted.
        """
        contact = entry.replace('Contact:', '').strip()
        contact_dict = {}

        # Only text inside a real "(...)" pair is an email candidate; slicing
        # by str.find() would cut an arbitrary substring when a parenthesis
        # is missing (find() returns -1).
        parenthesised = re.search(r'\(([^)]*)\)', contact)
        contact_email = parenthesised.group(1) if parenthesised else ''
        if contact_email and re.match(r"[^@]+@[^@]+\.[^@]+", contact_email):
            contact_dict['email'] = contact_email

        contact_name = contact.split('(', 1)[0].strip()
        if ', Curator' in contact_name:
            contact_name = contact_name.replace(', Curator', '').strip()
        if contact_name:
            contact_dict['name'] = contact_name

        return contact_dict

    def extract_text(self, text):
        """Return the stripped text of a soup element, or '' for ``None``."""
        if text is None:
            return ''
        return text.text.strip()

    def process_collection_stat(self, list_values):
        """Map each ``"<count> <label>"`` string to ``{label: count}``.

        The count is the first whitespace-delimited token and the label is
        everything after it. Blank items are skipped.
        """
        stat = {}
        for item in list_values:
            # Split once on the first whitespace run so a count that also
            # appears inside the label (e.g. "0 (0%) georeferenced") is not
            # stripped out of the label.
            parts = item.split(None, 1)
            if not parts:
                continue
            label = parts[1].strip() if len(parts) > 1 else ''
            stat[label] = parts[0]
        return stat
+
+ + + A. Michael Powell Herbarium + ++
+
+ Sample description
+
+ Contact:
+ Test Author (author@email.com)
+
+ |
+
+ + + A. Michael Powell Herbarium + ++
+
+ Sample description
+
+ Contact:
+ Test Author (author@email.com)
+
+ |
+