Spaces:
Running on Zero
Running on Zero
Migrate OpenAlex integration off PyAlex
Browse files- .gitignore +3 -0
- OPENALEX_API_MIGRATION_PROPOSAL.md +657 -0
- app.py +48 -196
- config_loader.py +34 -0
- openalex_client.py +369 -0
- openalex_config.example.json +3 -0
- openalex_utils.py +186 -341
- requirements.txt +0 -1
.gitignore
CHANGED
|
@@ -6,6 +6,8 @@
|
|
| 6 |
!*.py
|
| 7 |
!requirements.txt
|
| 8 |
!README.md
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# Even if they are in subdirectories
|
| 11 |
!*/
|
|
@@ -43,3 +45,4 @@ static/
|
|
| 43 |
|
| 44 |
app_save_copy.py
|
| 45 |
app_2.py
|
|
|
|
|
|
| 6 |
!*.py
|
| 7 |
!requirements.txt
|
| 8 |
!README.md
|
| 9 |
+
!OPENALEX_API_MIGRATION_PROPOSAL.md
|
| 10 |
+
!openalex_config.example.json
|
| 11 |
|
| 12 |
# Even if they are in subdirectories
|
| 13 |
!*/
|
|
|
|
| 45 |
|
| 46 |
app_save_copy.py
|
| 47 |
app_2.py
|
| 48 |
+
openalex_config.local.json
|
OPENALEX_API_MIGRATION_PROPOSAL.md
ADDED
|
@@ -0,0 +1,657 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Proposal: OpenAlex API Migration for OpenAlex Mapper
|
| 2 |
+
|
| 3 |
+
Date: 2026-03-12
|
| 4 |
+
|
| 5 |
+
## Executive summary
|
| 6 |
+
|
| 7 |
+
The repository does not need a UI redesign. The plotting, embedding, CSV upload, and downstream record processing can stay largely as they are. The brittle part is the OpenAlex transport/query layer.
|
| 8 |
+
|
| 9 |
+
The current code:
|
| 10 |
+
|
| 11 |
+
- accepts OpenAlex website URLs and turns them into PyAlex queries in [`openalex_utils.py`](./openalex_utils.py)
|
| 12 |
+
- authenticates with `pyalex.config.email` in [`app.py`](./app.py)
|
| 13 |
+
- relies on page-based pagination and `query.count()` in [`app.py`](./app.py)
|
| 14 |
+
- still understands deprecated filters like `default.search`, `title_and_abstract.search`, and `host_venue.id`
|
| 15 |
+
|
| 16 |
+
That is no longer a safe contract with the current OpenAlex API.
|
| 17 |
+
|
| 18 |
+
My recommendation is:
|
| 19 |
+
|
| 20 |
+
1. Keep the user-facing interface exactly as it is.
|
| 21 |
+
2. Introduce a repository-owned OpenAlex compatibility layer.
|
| 22 |
+
3. Move the main list-fetching path from "URL -> PyAlex DSL" to "URL -> normalized API params -> direct HTTP client".
|
| 23 |
+
4. Keep PyAlex 0.21 only where it adds value during transition, or remove it from the hot path entirely.
|
| 24 |
+
|
| 25 |
+
This is the lowest-risk way to keep old pasted OpenAlex URLs working while aligning with the current API.
|
| 26 |
+
|
| 27 |
+
## What the repository is doing now
|
| 28 |
+
|
| 29 |
+
### Current integration points
|
| 30 |
+
|
| 31 |
+
- [`app.py`](./app.py#L197) sets `pyalex.config.email`, not an API key.
|
| 32 |
+
- [`openalex_utils.py`](./openalex_utils.py#L7) parses an OpenAlex URL into a `Works()` query.
|
| 33 |
+
- [`openalex_utils.py`](./openalex_utils.py#L24) splits `filter=` on commas and maps `default.search` to `.search(...)`.
|
| 34 |
+
- [`app.py`](./app.py#L480) calls `query.count()` before fetching.
|
| 35 |
+
- [`app.py`](./app.py#L531), [`app.py`](./app.py#L558), [`app.py`](./app.py#L577), and [`app.py`](./app.py#L614) paginate with `method='page'`.
|
| 36 |
+
- [`openalex_utils.py`](./openalex_utils.py#L212) resolves DOI CSV uploads with `Works().filter(doi=doi_str).get(...)`.
|
| 37 |
+
- [`openalex_utils.py`](./openalex_utils.py#L218) generates readable query labels and still special-cases `host_venue.id` and `concepts.id`.
|
| 38 |
+
|
| 39 |
+
### Important observation
|
| 40 |
+
|
| 41 |
+
The downstream record model is mostly still compatible with current OpenAlex work responses:
|
| 42 |
+
|
| 43 |
+
- `primary_location`
|
| 44 |
+
- `primary_topic`
|
| 45 |
+
- `abstract_inverted_index`
|
| 46 |
+
- `referenced_works`
|
| 47 |
+
- `title`
|
| 48 |
+
|
| 49 |
+
That means the migration can be focused on query normalization, authentication, pagination, and transport. The plotting pipeline does not need to change shape.
|
| 50 |
+
|
| 51 |
+
## Current OpenAlex state that matters for this repo
|
| 52 |
+
|
| 53 |
+
### 1. Authentication has changed
|
| 54 |
+
|
| 55 |
+
OpenAlex now documents API-key-based access and credit-based billing. The older "polite pool via email" approach is no longer the right production model.
|
| 56 |
+
|
| 57 |
+
Impact on this repo:
|
| 58 |
+
|
| 59 |
+
- [`app.py`](./app.py#L197) is configured for the old model.
|
| 60 |
+
- The Hugging Face app should assume `OPENALEX_API_KEY` is required.
|
| 61 |
+
- `pyalex.config.email` is no longer enough as the primary auth strategy.
|
| 62 |
+
|
| 63 |
+
Recommended response:
|
| 64 |
+
|
| 65 |
+
- add `OPENALEX_API_KEY`
|
| 66 |
+
- configure the client with the API key
|
| 67 |
+
- keep `email` only as optional metadata, not as the core auth mechanism
|
| 68 |
+
|
| 69 |
+
### 2. Page-based pagination is only safe for shallow result sets
|
| 70 |
+
|
| 71 |
+
OpenAlex now requires cursor pagination beyond 10,000 results. I verified this live on 2026-03-12:
|
| 72 |
+
|
| 73 |
+
- `https://api.openalex.org/works?filter=publication_year:2024&page=101&per_page=100`
|
| 74 |
+
- response: `Pagination error. Maximum results size of 10,000 records is exceeded. Cursor pagination is required for records beyond 10,000.`
|
| 75 |
+
|
| 76 |
+
Impact on this repo:
|
| 77 |
+
|
| 78 |
+
- the "All" path in [`app.py`](./app.py#L605)
|
| 79 |
+
- the "First n samples" path in [`app.py`](./app.py#L605)
|
| 80 |
+
- the random-sampling fallback in [`app.py`](./app.py#L572)
|
| 81 |
+
|
| 82 |
+
All three currently rely on page-based traversal and can fail on large queries.
|
| 83 |
+
|
| 84 |
+
Recommended response:
|
| 85 |
+
|
| 86 |
+
- use cursor pagination for all full-download paths
|
| 87 |
+
- reserve page pagination only for explicitly shallow requests, or stop using it entirely
|
| 88 |
+
|
| 89 |
+
### 3. Deprecated search/filter names are still in the code
|
| 90 |
+
|
| 91 |
+
Current OpenAlex docs mark old search/filter surfaces as deprecated, including:
|
| 92 |
+
|
| 93 |
+
- `default.search`
|
| 94 |
+
- `title_and_abstract.search`
|
| 95 |
+
- `host_venue`
|
| 96 |
+
- `alternate_host_venues`
|
| 97 |
+
- `x_concepts`
|
| 98 |
+
- Concepts as the old classification system
|
| 99 |
+
|
| 100 |
+
Impact on this repo:
|
| 101 |
+
|
| 102 |
+
- [`openalex_utils.py`](./openalex_utils.py#L29) still maps `default.search`
|
| 103 |
+
- [`openalex_utils.py`](./openalex_utils.py#L261) still understands `title_and_abstract.search`
|
| 104 |
+
- [`openalex_utils.py`](./openalex_utils.py#L340) still special-cases `host_venue.id`
|
| 105 |
+
- [`openalex_utils.py`](./openalex_utils.py#L350) still treats `concepts.id` as a first-class query label case
|
| 106 |
+
|
| 107 |
+
### 4. `host_venue.id` is not just deprecated; it already breaks
|
| 108 |
+
|
| 109 |
+
I verified this live on 2026-03-12:
|
| 110 |
+
|
| 111 |
+
- `https://api.openalex.org/works?filter=host_venue.id:S125754415&per_page=1`
|
| 112 |
+
- response: `Invalid query parameters error. host_venue and alternate_host_venues are deprecated in favor of locations.`
|
| 113 |
+
|
| 114 |
+
The replacement works:
|
| 115 |
+
|
| 116 |
+
- `https://api.openalex.org/works?filter=primary_location.source.id:S125754415&per_page=1`
|
| 117 |
+
|
| 118 |
+
Impact on this repo:
|
| 119 |
+
|
| 120 |
+
- old user-pasted URLs containing `host_venue.id` will fail today
|
| 121 |
+
|
| 122 |
+
Recommended response:
|
| 123 |
+
|
| 124 |
+
- normalize `host_venue.id -> primary_location.source.id`
|
| 125 |
+
- normalize `alternate_host_venues.id -> locations.source.id`
|
| 126 |
+
|
| 127 |
+
### 5. Concepts are legacy; Topics are the current taxonomy
|
| 128 |
+
|
| 129 |
+
OpenAlex responses still include `concepts`, and `concepts.id` still works today, but OpenAlex now treats Topics as the current classification system.
|
| 130 |
+
|
| 131 |
+
Impact on this repo:
|
| 132 |
+
|
| 133 |
+
- the app itself mostly uses `primary_topic`, which is good
|
| 134 |
+
- old user URLs may still use `concepts.id`
|
| 135 |
+
|
| 136 |
+
Recommended response:
|
| 137 |
+
|
| 138 |
+
- do not break old `concepts.id` URLs immediately
|
| 139 |
+
- treat them as legacy pass-through
|
| 140 |
+
- update labels, examples, and new code to prefer Topics
|
| 141 |
+
- do not try to auto-convert Concept IDs to Topic IDs; that is not a clean one-to-one migration
|
| 142 |
+
|
| 143 |
+
### 6. `search=` should be the canonical search input
|
| 144 |
+
|
| 145 |
+
Current docs prefer the top-level `search=` parameter and current field-specific search filters over `default.search` and `title_and_abstract.search`.
|
| 146 |
+
|
| 147 |
+
Impact on this repo:
|
| 148 |
+
|
| 149 |
+
- the parser should accept old URLs
|
| 150 |
+
- internal canonical form should use current search syntax
|
| 151 |
+
|
| 152 |
+
Recommended response:
|
| 153 |
+
|
| 154 |
+
- normalize `filter=default.search:...` into `search=...`
|
| 155 |
+
- keep `title_and_abstract.search` accepted for legacy compatibility
|
| 156 |
+
- internally prefer `search`, `title.search`, `abstract.search`, and `fulltext.search`
|
| 157 |
+
|
| 158 |
+
### 7. XPAC is now an opt-in corpus extension
|
| 159 |
+
|
| 160 |
+
OpenAlex supports `include_xpac=true` to include the extended paper corpus.
|
| 161 |
+
|
| 162 |
+
Impact on this repo:
|
| 163 |
+
|
| 164 |
+
- enabling XPAC changes result sets
|
| 165 |
+
- that would alter the app's semantics without the user asking for it
|
| 166 |
+
|
| 167 |
+
Recommended response:
|
| 168 |
+
|
| 169 |
+
- explicitly keep `include_xpac=false` or omit it
|
| 170 |
+
- do not change corpus scope if the requirement is "keep the interface working as is"
|
| 171 |
+
|
| 172 |
+
### 8. The current API still returns both `title` and `display_name`
|
| 173 |
+
|
| 174 |
+
Live work responses currently include both `title` and `display_name`.
|
| 175 |
+
|
| 176 |
+
Impact on this repo:
|
| 177 |
+
|
| 178 |
+
- existing downstream code using `title` still works
|
| 179 |
+
- we should still normalize defensively in case OpenAlex eventually removes one duplicate
|
| 180 |
+
|
| 181 |
+
Recommended response:
|
| 182 |
+
|
| 183 |
+
- set `record["title"] = record.get("title") or record.get("display_name") or " "`
|
| 184 |
+
|
| 185 |
+
### 9. The docs and runtime are not perfectly aligned
|
| 186 |
+
|
| 187 |
+
Two examples from today:
|
| 188 |
+
|
| 189 |
+
- docs emphasize API keys, but unauthenticated requests still returned `200` from this environment
|
| 190 |
+
- docs describe `per_page` max 100, but the live API still accepted `per_page=200`
|
| 191 |
+
|
| 192 |
+
That should not reassure us. It means undocumented compatibility still exists, not that it is safe to rely on.
|
| 193 |
+
|
| 194 |
+
Recommended response:
|
| 195 |
+
|
| 196 |
+
- code to the documented contract, not the current accidental tolerance
|
| 197 |
+
|
| 198 |
+
## Current PyAlex state
|
| 199 |
+
|
| 200 |
+
### Version and maintenance surface
|
| 201 |
+
|
| 202 |
+
PyAlex on PyPI is currently at 0.21, uploaded 2026-02-23.
|
| 203 |
+
|
| 204 |
+
From its current package metadata/README, PyAlex supports:
|
| 205 |
+
|
| 206 |
+
- API key configuration
|
| 207 |
+
- select fields
|
| 208 |
+
- sample
|
| 209 |
+
- pagination
|
| 210 |
+
- OR filters
|
| 211 |
+
- search filters
|
| 212 |
+
- semantic search
|
| 213 |
+
|
| 214 |
+
It also still exposes deprecated OpenAlex surfaces such as:
|
| 215 |
+
|
| 216 |
+
- Concepts
|
| 217 |
+
- N-grams
|
| 218 |
+
- older search/filter patterns for compatibility
|
| 219 |
+
|
| 220 |
+
### What that means for this repo
|
| 221 |
+
|
| 222 |
+
PyAlex is not the main problem. The current repo problem is that we are translating pasted OpenAlex URLs into PyAlex calls with our own fragile parser.
|
| 223 |
+
|
| 224 |
+
PyAlex is still viable if:
|
| 225 |
+
|
| 226 |
+
- we pin it
|
| 227 |
+
- we stop letting UI code depend directly on PyAlex query construction
|
| 228 |
+
- we put a repository-owned adapter in front of it
|
| 229 |
+
|
| 230 |
+
### My recommendation on PyAlex
|
| 231 |
+
|
| 232 |
+
I would not use PyAlex as the primary transport layer for list fetching anymore.
|
| 233 |
+
|
| 234 |
+
Reason:
|
| 235 |
+
|
| 236 |
+
- the app's input contract is raw OpenAlex URLs
|
| 237 |
+
- OpenAlex's current API surface is evolving
|
| 238 |
+
- URL normalization is simpler and more faithful if we keep requests as HTTP params instead of forcing them through a Python query DSL
|
| 239 |
+
|
| 240 |
+
Recommended compromise:
|
| 241 |
+
|
| 242 |
+
- use direct HTTP for list/sampling/pagination
|
| 243 |
+
- keep PyAlex temporarily for singleton lookups if convenient
|
| 244 |
+
- pin `pyalex>=0.21,<0.22` while that transition is happening
|
| 245 |
+
|
| 246 |
+
If you want the smallest possible dependency surface, PyAlex can be removed entirely later.
|
| 247 |
+
|
| 248 |
+
## Proposed migration design
|
| 249 |
+
|
| 250 |
+
## Goal
|
| 251 |
+
|
| 252 |
+
Keep all current user-facing behavior:
|
| 253 |
+
|
| 254 |
+
- same Gradio controls
|
| 255 |
+
- same textbox contract: paste OpenAlex URLs
|
| 256 |
+
- same semicolon-separated multiple-query input
|
| 257 |
+
- same sample-size controls
|
| 258 |
+
- same uploaded CSV behavior
|
| 259 |
+
- same output plot and downloadable CSV shape
|
| 260 |
+
|
| 261 |
+
Only the backend fetch layer changes.
|
| 262 |
+
|
| 263 |
+
## Proposed architecture
|
| 264 |
+
|
| 265 |
+
### 1. Add a repository-owned compatibility layer
|
| 266 |
+
|
| 267 |
+
Create a new module, for example `openalex_client.py`, responsible for:
|
| 268 |
+
|
| 269 |
+
- auth
|
| 270 |
+
- retries and backoff
|
| 271 |
+
- current API parameter names
|
| 272 |
+
- cursor pagination
|
| 273 |
+
- deterministic random sampling
|
| 274 |
+
- field selection
|
| 275 |
+
- DOI batch resolution
|
| 276 |
+
- singleton lookups for query labels
|
| 277 |
+
|
| 278 |
+
This module should be the only place that knows how to talk to OpenAlex.
|
| 279 |
+
|
| 280 |
+
### 2. Add URL normalization before any network call
|
| 281 |
+
|
| 282 |
+
Create a normalization step, either in a new file like `openalex_query.py` or by refactoring [`openalex_utils.py`](./openalex_utils.py).
|
| 283 |
+
|
| 284 |
+
Responsibilities:
|
| 285 |
+
|
| 286 |
+
- accept both `openalex.org/...` and `api.openalex.org/...`
|
| 287 |
+
- preserve semicolon-separated multi-query input
|
| 288 |
+
- parse query params without lossy comma splitting
|
| 289 |
+
- produce a canonical internal representation
|
| 290 |
+
|
| 291 |
+
Canonicalization rules should include:
|
| 292 |
+
|
| 293 |
+
- `default.search -> search`
|
| 294 |
+
- `host_venue.id -> primary_location.source.id`
|
| 295 |
+
- `alternate_host_venues.id -> locations.source.id`
|
| 296 |
+
- `per-page -> per_page`
|
| 297 |
+
- `api_key` stripped from user input and sourced only from environment
|
| 298 |
+
|
| 299 |
+
Legacy rules:
|
| 300 |
+
|
| 301 |
+
- `concepts.id` stays accepted as legacy
|
| 302 |
+
- `title_and_abstract.search` stays accepted, but is marked legacy in code and tests
|
| 303 |
+
|
| 304 |
+
### 3. Fetch lists through direct HTTP, not PyAlex query objects
|
| 305 |
+
|
| 306 |
+
Use `requests`, which the repo already depends on.
|
| 307 |
+
|
| 308 |
+
For list fetching:
|
| 309 |
+
|
| 310 |
+
- build API URLs/params directly
|
| 311 |
+
- use `cursor=*` for deep pagination
|
| 312 |
+
- use `select=` to minimize payload
|
| 313 |
+
- centralize retry and rate-limit handling
|
| 314 |
+
|
| 315 |
+
This avoids the current failure mode where a user URL must survive:
|
| 316 |
+
|
| 317 |
+
`OpenAlex URL -> custom parser -> PyAlex DSL -> OpenAlex request`
|
| 318 |
+
|
| 319 |
+
Instead it becomes:
|
| 320 |
+
|
| 321 |
+
`OpenAlex URL -> normalize -> OpenAlex request`
|
| 322 |
+
|
| 323 |
+
That is simpler and less fragile.
|
| 324 |
+
|
| 325 |
+
### 4. Keep output records in the existing shape
|
| 326 |
+
|
| 327 |
+
The client should normalize each work record into the shape expected by the rest of the app:
|
| 328 |
+
|
| 329 |
+
- `id`
|
| 330 |
+
- `title`
|
| 331 |
+
- `doi`
|
| 332 |
+
- `publication_year`
|
| 333 |
+
- `abstract_inverted_index`
|
| 334 |
+
- `primary_location`
|
| 335 |
+
- `primary_topic`
|
| 336 |
+
- `referenced_works`
|
| 337 |
+
|
| 338 |
+
Normalization defaults:
|
| 339 |
+
|
| 340 |
+
- `title = title or display_name or " "`
|
| 341 |
+
- `abstract_inverted_index`: `{}` or `None`, both handled safely by the existing abstract reconstruction
|
| 342 |
+
- `referenced_works = []`
|
| 343 |
+
- `primary_location = None`
|
| 344 |
+
- `primary_topic = None`
|
| 345 |
+
|
| 346 |
+
This preserves the interface between the fetch layer and the plotting layer.
|
| 347 |
+
|
| 348 |
+
## Detailed implementation plan
|
| 349 |
+
|
| 350 |
+
### Phase 1: auth and transport
|
| 351 |
+
|
| 352 |
+
Changes:
|
| 353 |
+
|
| 354 |
+
- replace the `pyalex.config.email = ...` setup in [`app.py`](./app.py#L197) with API-key configuration
|
| 355 |
+
- read `OPENALEX_API_KEY` from the environment
|
| 356 |
+
- initialize a shared OpenAlex client with retries, timeout, and a descriptive user agent
|
| 357 |
+
|
| 358 |
+
If no API key is present:
|
| 359 |
+
|
| 360 |
+
- local development can warn and continue
|
| 361 |
+
- deployed environments should fail loudly at startup
|
| 362 |
+
|
| 363 |
+
This is the first change I would make because it affects every query path.
|
| 364 |
+
|
| 365 |
+
### Phase 2: URL normalization
|
| 366 |
+
|
| 367 |
+
Refactor [`openalex_utils.py`](./openalex_utils.py#L7).
|
| 368 |
+
|
| 369 |
+
Replace `openalex_url_to_pyalex_query(url)` with something like:
|
| 370 |
+
|
| 371 |
+
- `parse_openalex_input_url(url) -> ParsedQuery`
|
| 372 |
+
- `normalize_openalex_query(parsed) -> CanonicalQuery`
|
| 373 |
+
|
| 374 |
+
Important note:
|
| 375 |
+
|
| 376 |
+
The current parser does `query_params['filter'][0].split(',')`.
|
| 377 |
+
|
| 378 |
+
That is unsafe for any filter value that legitimately contains commas after URL decoding. It is also the wrong foundation for long-term compatibility.
|
| 379 |
+
|
| 380 |
+
Use either:
|
| 381 |
+
|
| 382 |
+
- a small filter tokenizer that respects quoted values
|
| 383 |
+
- or a normalization strategy that manipulates the raw filter string and only tokenizes when necessary
|
| 384 |
+
|
| 385 |
+
### Phase 3: list fetching
|
| 386 |
+
|
| 387 |
+
Replace the fetch logic in [`app.py`](./app.py#L477).
|
| 388 |
+
|
| 389 |
+
#### For "All"
|
| 390 |
+
|
| 391 |
+
- use cursor pagination until exhausted
|
| 392 |
+
- stop relying on `query.count()` for control flow
|
| 393 |
+
|
| 394 |
+
#### For "First n samples"
|
| 395 |
+
|
| 396 |
+
- use cursor pagination
|
| 397 |
+
- stop once `n` records have been collected
|
| 398 |
+
|
| 399 |
+
This preserves visible behavior while avoiding the 10,000-record page limit.
|
| 400 |
+
|
| 401 |
+
#### For "n random samples"
|
| 402 |
+
|
| 403 |
+
Use two modes:
|
| 404 |
+
|
| 405 |
+
- if `n <= 10000`, use OpenAlex sampling with `sample=n&seed=...`
|
| 406 |
+
- if `n > 10000`, use cursor pagination plus deterministic reservoir sampling locally
|
| 407 |
+
|
| 408 |
+
I would not keep the current "repeat `sample()` with different seeds and dedupe" strategy as the long-term design. It is workable, but it is not the cleanest statistical contract.
|
| 409 |
+
|
| 410 |
+
### Phase 4: select only required fields
|
| 411 |
+
|
| 412 |
+
Current list fetches pull full work records.
|
| 413 |
+
|
| 414 |
+
The app only needs a subset for the main pipeline. Use `select=` with something close to:
|
| 415 |
+
|
| 416 |
+
- `id`
|
| 417 |
+
- `title`
|
| 418 |
+
- `display_name`
|
| 419 |
+
- `doi`
|
| 420 |
+
- `publication_year`
|
| 421 |
+
- `abstract_inverted_index`
|
| 422 |
+
- `primary_location`
|
| 423 |
+
- `primary_topic`
|
| 424 |
+
- `referenced_works`
|
| 425 |
+
|
| 426 |
+
Benefits:
|
| 427 |
+
|
| 428 |
+
- less bandwidth
|
| 429 |
+
- lower latency
|
| 430 |
+
- lower credit usage
|
| 431 |
+
- less memory pressure in the HF Space
|
| 432 |
+
|
| 433 |
+
### Phase 5: rewrite readable-name helpers
|
| 434 |
+
|
| 435 |
+
Refactor [`openalex_utils.py`](./openalex_utils.py#L218).
|
| 436 |
+
|
| 437 |
+
The current readable-name path should stop assuming:
|
| 438 |
+
|
| 439 |
+
- `host_venue.id`
|
| 440 |
+
- Concept-first terminology
|
| 441 |
+
- direct PyAlex singleton calls from arbitrary old filter names
|
| 442 |
+
|
| 443 |
+
New logic:
|
| 444 |
+
|
| 445 |
+
- build labels from the normalized query
|
| 446 |
+
- resolve singleton IDs through the shared client
|
| 447 |
+
- cache author/institution/work lookups in memory
|
| 448 |
+
|
| 449 |
+
That prevents repeated network calls when the same query name is rendered multiple times.
|
| 450 |
+
|
| 451 |
+
### Phase 6: DOI upload path
|
| 452 |
+
|
| 453 |
+
Refactor [`openalex_utils.py`](./openalex_utils.py#L196).
|
| 454 |
+
|
| 455 |
+
The DOI upload flow is still conceptually fine, but it should be moved into the shared client.
|
| 456 |
+
|
| 457 |
+
Recommended behavior:
|
| 458 |
+
|
| 459 |
+
- batch DOI OR-filters up to OpenAlex's supported set size
|
| 460 |
+
- enforce URL-length limits
|
| 461 |
+
- reuse the same normalization, retry, and `select=` logic
|
| 462 |
+
|
| 463 |
+
This keeps uploaded DOI CSVs working exactly as they do now.
|
| 464 |
+
|
| 465 |
+
### Phase 7: defensive record normalization
|
| 466 |
+
|
| 467 |
+
Refactor [`process_records_to_df`](./openalex_utils.py#L98) only lightly.
|
| 468 |
+
|
| 469 |
+
Keep the function, but make it more explicit about current schema tolerances:
|
| 470 |
+
|
| 471 |
+
- missing `primary_location`
|
| 472 |
+
- missing `source`
|
| 473 |
+
- missing `primary_topic`
|
| 474 |
+
- missing `title` but present `display_name`
|
| 475 |
+
- missing `abstract_inverted_index`
|
| 476 |
+
|
| 477 |
+
This is not a large migration, just defensive cleanup.
|
| 478 |
+
|
| 479 |
+
### Phase 8: documentation and examples
|
| 480 |
+
|
| 481 |
+
Update the user-facing examples in [`app.py`](./app.py#L1265) so that new examples use current canonical OpenAlex URLs.
|
| 482 |
+
|
| 483 |
+
Important:
|
| 484 |
+
|
| 485 |
+
- still accept old URLs
|
| 486 |
+
- show new URLs in examples
|
| 487 |
+
|
| 488 |
+
That keeps the interface the same while steering users toward the current API surface.
|
| 489 |
+
|
| 490 |
+
## Proposed file-level changes
|
| 491 |
+
|
| 492 |
+
### `app.py`
|
| 493 |
+
|
| 494 |
+
Change:
|
| 495 |
+
|
| 496 |
+
- remove direct reliance on PyAlex query objects
|
| 497 |
+
- call shared client methods instead
|
| 498 |
+
- replace API auth setup
|
| 499 |
+
|
| 500 |
+
Keep:
|
| 501 |
+
|
| 502 |
+
- UI
|
| 503 |
+
- sampling controls
|
| 504 |
+
- plotting logic
|
| 505 |
+
- CSV export format
|
| 506 |
+
|
| 507 |
+
### `openalex_utils.py`
|
| 508 |
+
|
| 509 |
+
Keep:
|
| 510 |
+
|
| 511 |
+
- `invert_abstract`
|
| 512 |
+
- `process_records_to_df`
|
| 513 |
+
- `get_pub`
|
| 514 |
+
- `get_field`
|
| 515 |
+
- filename/readable-name helpers, but rewritten to use normalized queries
|
| 516 |
+
|
| 517 |
+
Remove or replace:
|
| 518 |
+
|
| 519 |
+
- `openalex_url_to_pyalex_query`
|
| 520 |
+
|
| 521 |
+
### `requirements.txt`
|
| 522 |
+
|
| 523 |
+
Change:
|
| 524 |
+
|
| 525 |
+
- pin `pyalex>=0.21,<0.22` if it remains
|
| 526 |
+
- otherwise remove `pyalex`
|
| 527 |
+
|
| 528 |
+
Keep:
|
| 529 |
+
|
| 530 |
+
- `requests`
|
| 531 |
+
|
| 532 |
+
Optional:
|
| 533 |
+
|
| 534 |
+
- add `httpx` only if you want async or more structured retry middleware; it is not necessary for this migration
|
| 535 |
+
|
| 536 |
+
## What should not change
|
| 537 |
+
|
| 538 |
+
To satisfy "keep the interface working as is", I would explicitly preserve:
|
| 539 |
+
|
| 540 |
+
- the textbox accepting pasted OpenAlex URLs
|
| 541 |
+
- semicolon-separated multi-query input
|
| 542 |
+
- the "Reduce Sample Size" flow
|
| 543 |
+
- the "First n samples" and "n random samples" options
|
| 544 |
+
- the CSV upload flow
|
| 545 |
+
- the downloadable CSV schema
|
| 546 |
+
- the use of `primary_topic` for field labels and coloring
|
| 547 |
+
|
| 548 |
+
## Edge cases to support
|
| 549 |
+
|
| 550 |
+
The migration should explicitly test and support:
|
| 551 |
+
|
| 552 |
+
- old website URLs using `default.search`
|
| 553 |
+
- old website URLs using `host_venue.id`
|
| 554 |
+
- old website URLs using `concepts.id`
|
| 555 |
+
- current URLs using `search=...`
|
| 556 |
+
- semicolon-separated multiple URLs
|
| 557 |
+
- DOI CSV uploads
|
| 558 |
+
- large queries over 10,000 results
|
| 559 |
+
- random samples over 10,000 requested records
|
| 560 |
+
- filters with OR values using `|`
|
| 561 |
+
- filters with year ranges
|
| 562 |
+
|
| 563 |
+
## Testing plan
|
| 564 |
+
|
| 565 |
+
### Unit tests
|
| 566 |
+
|
| 567 |
+
Add tests for normalization:
|
| 568 |
+
|
| 569 |
+
- `default.search` becomes canonical `search`
|
| 570 |
+
- `host_venue.id` rewrites to `primary_location.source.id`
|
| 571 |
+
- `alternate_host_venues.id` rewrites to `locations.source.id`
|
| 572 |
+
- `concepts.id` is preserved but flagged as legacy
|
| 573 |
+
- `per-page` becomes `per_page`
|
| 574 |
+
|
| 575 |
+
Add tests for record normalization:
|
| 576 |
+
|
| 577 |
+
- missing `title`
|
| 578 |
+
- missing `primary_location`
|
| 579 |
+
- missing `primary_topic`
|
| 580 |
+
- missing `abstract_inverted_index`
|
| 581 |
+
|
| 582 |
+
### Integration tests
|
| 583 |
+
|
| 584 |
+
Use mocked HTTP responses or recorded fixtures for:
|
| 585 |
+
|
| 586 |
+
- cursor pagination
|
| 587 |
+
- random sampling
|
| 588 |
+
- DOI batch resolution
|
| 589 |
+
- singleton lookups for readable labels
|
| 590 |
+
|
| 591 |
+
### Live smoke tests
|
| 592 |
+
|
| 593 |
+
Run a small set of real queries against OpenAlex:
|
| 594 |
+
|
| 595 |
+
- a search query
|
| 596 |
+
- an institution-year filter
|
| 597 |
+
- a citation query
|
| 598 |
+
- a legacy `host_venue.id` query that must now succeed via rewrite
|
| 599 |
+
|
| 600 |
+
## Rollout sequence
|
| 601 |
+
|
| 602 |
+
Recommended order:
|
| 603 |
+
|
| 604 |
+
1. Add the new client and auth handling.
|
| 605 |
+
2. Add URL normalization and tests.
|
| 606 |
+
3. Swap main fetch paths in `app.py`.
|
| 607 |
+
4. Swap DOI upload path.
|
| 608 |
+
5. Rewrite readable-name generation.
|
| 609 |
+
6. Update examples and docs.
|
| 610 |
+
7. Pin or remove PyAlex.
|
| 611 |
+
|
| 612 |
+
This minimizes risk because the UI and plotting code remain untouched until the data layer is stable.
|
| 613 |
+
|
| 614 |
+
## Risk assessment
|
| 615 |
+
|
| 616 |
+
### Low risk
|
| 617 |
+
|
| 618 |
+
- auth migration to API key
|
| 619 |
+
- cursor pagination
|
| 620 |
+
- field selection
|
| 621 |
+
- defensive record normalization
|
| 622 |
+
|
| 623 |
+
### Medium risk
|
| 624 |
+
|
| 625 |
+
- URL normalization, because old OpenAlex website URLs are part of the app's public contract
|
| 626 |
+
|
| 627 |
+
### High risk (if handled incorrectly)
|
| 628 |
+
|
| 629 |
+
- automatic conversion from Concepts to Topics
|
| 630 |
+
|
| 631 |
+
I would not promise perfect automatic Concept-to-Topic migration. Legacy Concept URLs should remain supported as long as OpenAlex still accepts them. If OpenAlex removes them later, that should become a clear user-facing compatibility warning, not a silent semantic rewrite.
|
| 632 |
+
|
| 633 |
+
## Final recommendation
|
| 634 |
+
|
| 635 |
+
Do not try to "patch" the current `openalex_url_to_pyalex_query()` approach into compliance.
|
| 636 |
+
|
| 637 |
+
That function is the wrong abstraction now. The app's input is an OpenAlex URL, and the safest way to preserve the current interface is:
|
| 638 |
+
|
| 639 |
+
- normalize that URL
|
| 640 |
+
- call the OpenAlex API directly
|
| 641 |
+
- keep the returned records in the same downstream shape
|
| 642 |
+
|
| 643 |
+
PyAlex 0.21 is still useful, but it should no longer define the repository's transport contract.
|
| 644 |
+
|
| 645 |
+
## Sources
|
| 646 |
+
|
| 647 |
+
- OpenAlex API overview: https://developers.openalex.org/api-reference/works/list-works
|
| 648 |
+
- OpenAlex works filters/search fields: https://developers.openalex.org/api-reference/works/list-works
|
| 649 |
+
- OpenAlex authentication and pricing: https://developers.openalex.org/getting-started/api-overview
|
| 650 |
+
- OpenAlex LLM quick reference: https://developers.openalex.org/api-guide-for-llms
|
| 651 |
+
- OpenAlex pagination guide: https://developers.openalex.org/how-to-use-the-api/get-lists-of-entities/page-through-results
|
| 652 |
+
- OpenAlex select-fields guide: https://developers.openalex.org/how-to-use-the-api/get-lists-of-entities/select-fields
|
| 653 |
+
- OpenAlex deprecations: https://developers.openalex.org/guides/deprecations
|
| 654 |
+
- PyAlex package metadata: https://pypi.org/project/pyalex/
|
| 655 |
+
- Live OpenAlex host_venue failure check: https://api.openalex.org/works?filter=host_venue.id:S125754415&per_page=1
|
| 656 |
+
- Live OpenAlex replacement filter check: https://api.openalex.org/works?filter=primary_location.source.id:S125754415&per_page=1
|
| 657 |
+
- Live OpenAlex pagination failure check: https://api.openalex.org/works?filter=publication_year:2024&page=101&per_page=100
|
app.py
CHANGED
|
@@ -5,6 +5,9 @@ print(f"Starting up: {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
|
| 5 |
# Standard library imports
|
| 6 |
|
| 7 |
import os
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
#Enforce local caching:
|
| 10 |
|
|
@@ -78,8 +81,6 @@ import colormaps
|
|
| 78 |
import matplotlib.colors as mcolors
|
| 79 |
from matplotlib.colors import Normalize
|
| 80 |
|
| 81 |
-
import random
|
| 82 |
-
|
| 83 |
import opinionated # for fonts
|
| 84 |
plt.style.use("opinionated_rc")
|
| 85 |
|
|
@@ -159,11 +160,10 @@ def _get_token(request: gr.Request):
|
|
| 159 |
#print(f"Spaces version: {spaces.__version__}")
|
| 160 |
|
| 161 |
import datamapplot
|
| 162 |
-
import pyalex
|
| 163 |
|
| 164 |
# Local imports
|
|
|
|
| 165 |
from openalex_utils import (
|
| 166 |
-
openalex_url_to_pyalex_query,
|
| 167 |
get_field,
|
| 168 |
process_records_to_df,
|
| 169 |
openalex_url_to_filename,
|
|
@@ -195,7 +195,7 @@ except ImportError:
|
|
| 195 |
|
| 196 |
|
| 197 |
# Configure OpenAlex
|
| 198 |
-
|
| 199 |
|
| 200 |
print(f"Imports completed: {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
| 201 |
|
|
@@ -466,209 +466,61 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
|
|
| 466 |
urls = [url.strip() for url in text_input.split(';')]
|
| 467 |
records = []
|
| 468 |
query_indices = [] # Track which query each record comes from
|
| 469 |
-
total_query_length = 0
|
| 470 |
-
expected_download_count = 0 # Track expected number of records to download for progress
|
| 471 |
|
| 472 |
# Use first URL for filename
|
| 473 |
-
first_query, first_params = openalex_url_to_pyalex_query(urls[0])
|
| 474 |
filename = openalex_url_to_filename(urls[0])
|
| 475 |
print(f"Filename: {filename}")
|
| 476 |
|
| 477 |
# Process each URL
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
batch_num = 1
|
| 517 |
-
|
| 518 |
-
print(f'Target size {target_size} > 10k, using batched sampling with batch size {batch_size}')
|
| 519 |
-
|
| 520 |
-
while remaining > 0 and len(sampled_records) < target_size:
|
| 521 |
-
current_batch_size = min(batch_size, remaining)
|
| 522 |
-
batch_seed = seed_int + batch_num # Different seed for each batch
|
| 523 |
-
|
| 524 |
-
print(f'Batch {batch_num}: requesting {current_batch_size} samples (seed={batch_seed})')
|
| 525 |
-
|
| 526 |
-
# Sample this batch
|
| 527 |
-
batch_query = query.sample(current_batch_size, seed=batch_seed)
|
| 528 |
-
|
| 529 |
-
batch_records = []
|
| 530 |
-
batch_count = 0
|
| 531 |
-
for page in batch_query.paginate(per_page=200, method='page', n_max=None):
|
| 532 |
-
for record in page:
|
| 533 |
-
# Check for duplicates using OpenAlex ID
|
| 534 |
-
record_id = record.get('id', '')
|
| 535 |
-
if record_id not in seen_ids:
|
| 536 |
-
seen_ids.add(record_id)
|
| 537 |
-
batch_records.append(record)
|
| 538 |
-
batch_count += 1
|
| 539 |
-
|
| 540 |
-
sampled_records.extend(batch_records)
|
| 541 |
-
remaining -= len(batch_records)
|
| 542 |
-
batch_num += 1
|
| 543 |
-
|
| 544 |
-
print(f'Batch {batch_num-1} complete: got {len(batch_records)} unique records ({len(sampled_records)}/{target_size} total)')
|
| 545 |
-
|
| 546 |
-
progress(0.1 + (0.15 * len(sampled_records) / target_size),
|
| 547 |
-
desc=f"Batched sampling from query {i+1}/{len(urls)}... ({len(sampled_records)}/{target_size})")
|
| 548 |
-
|
| 549 |
-
# Safety check to avoid infinite loops
|
| 550 |
-
if batch_num > 20: # Max 20 batches (should handle up to ~200k samples)
|
| 551 |
-
print("Warning: Maximum batch limit reached, stopping sampling")
|
| 552 |
-
break
|
| 553 |
-
else:
|
| 554 |
-
# Single batch sampling for <= 10k
|
| 555 |
-
sampled_query = query.sample(target_size, seed=seed_int)
|
| 556 |
-
|
| 557 |
-
records_count = 0
|
| 558 |
-
for page in sampled_query.paginate(per_page=200, method='page', n_max=None):
|
| 559 |
-
for record in page:
|
| 560 |
-
sampled_records.append(record)
|
| 561 |
-
records_count += 1
|
| 562 |
-
progress(0.1 + (0.15 * records_count / target_size),
|
| 563 |
-
desc=f"Getting sampled data from query {i+1}/{len(urls)}... ({records_count}/{target_size})")
|
| 564 |
-
|
| 565 |
-
print(f'PyAlex sampling successful: got {len(sampled_records)} records (requested {target_size})')
|
| 566 |
-
else:
|
| 567 |
-
raise AttributeError("sample method not available")
|
| 568 |
-
|
| 569 |
-
except Exception as e:
|
| 570 |
-
print(f"PyAlex sampling failed ({e}), using fallback method...")
|
| 571 |
-
|
| 572 |
-
# Fallback: get all records and sample manually
|
| 573 |
-
all_records = []
|
| 574 |
-
records_count = 0
|
| 575 |
-
|
| 576 |
-
# Use page pagination for fallback method
|
| 577 |
-
for page in query.paginate(per_page=200, method='page', n_max=None):
|
| 578 |
-
for record in page:
|
| 579 |
-
all_records.append(record)
|
| 580 |
-
records_count += 1
|
| 581 |
-
progress(0.1 + (0.15 * records_count / query_length),
|
| 582 |
-
desc=f"Downloading for sampling from query {i+1}/{len(urls)}...")
|
| 583 |
-
|
| 584 |
-
# Now sample manually
|
| 585 |
-
if len(all_records) > target_size:
|
| 586 |
-
import random
|
| 587 |
-
random.seed(seed_int)
|
| 588 |
-
sampled_records = random.sample(all_records, target_size)
|
| 589 |
-
else:
|
| 590 |
-
sampled_records = all_records
|
| 591 |
-
|
| 592 |
-
print(f'Fallback sampling: got {len(sampled_records)} from {len(all_records)} total')
|
| 593 |
-
|
| 594 |
-
# Add the sampled records
|
| 595 |
-
for idx, record in enumerate(sampled_records):
|
| 596 |
-
records.append(record)
|
| 597 |
-
query_indices.append(i)
|
| 598 |
-
# Safe progress calculation
|
| 599 |
-
if expected_download_count > 0:
|
| 600 |
-
progress_val = 0.1 + (0.2 * len(records) / expected_download_count)
|
| 601 |
-
else:
|
| 602 |
-
progress_val = 0.1
|
| 603 |
-
progress(progress_val, desc=f"Processing sampled data from query {i+1}/{len(urls)}...")
|
| 604 |
-
else:
|
| 605 |
-
# Keep existing logic for "First n samples" and "All"
|
| 606 |
-
target_size = sample_size_slider if reduce_sample_checkbox and sample_reduction_method == "First n samples" else query_length
|
| 607 |
-
records_per_query = 0
|
| 608 |
-
|
| 609 |
-
print(f"Query {i+1}: target_size={target_size}, query_length={query_length}, method={sample_reduction_method}")
|
| 610 |
-
|
| 611 |
-
should_break_current_query = False
|
| 612 |
-
# For "First n samples", limit the maximum records fetched to avoid over-downloading
|
| 613 |
-
max_records_to_fetch = target_size if reduce_sample_checkbox and sample_reduction_method == "First n samples" else None
|
| 614 |
-
for page in query.paginate(per_page=200, method='page', n_max=max_records_to_fetch):
|
| 615 |
-
# Add retry mechanism for processing each page
|
| 616 |
-
max_retries = 5
|
| 617 |
-
base_wait_time = 1 # Starting wait time in seconds
|
| 618 |
-
exponent = 1.5 # Exponential factor
|
| 619 |
-
|
| 620 |
-
for retry_attempt in range(max_retries):
|
| 621 |
-
try:
|
| 622 |
-
for record in page:
|
| 623 |
-
# Safety check: don't process if we've already reached target
|
| 624 |
-
if reduce_sample_checkbox and sample_reduction_method == "First n samples" and records_per_query >= target_size:
|
| 625 |
-
print(f"Reached target size before processing: {records_per_query}/{target_size}, breaking from download")
|
| 626 |
-
should_break_current_query = True
|
| 627 |
-
break
|
| 628 |
-
|
| 629 |
-
records.append(record)
|
| 630 |
-
query_indices.append(i) # Track which query this record comes from
|
| 631 |
-
records_per_query += 1
|
| 632 |
-
# Safe progress calculation
|
| 633 |
-
if expected_download_count > 0:
|
| 634 |
-
progress_val = 0.1 + (0.2 * len(records) / expected_download_count)
|
| 635 |
-
else:
|
| 636 |
-
progress_val = 0.1
|
| 637 |
-
progress(progress_val, desc=f"Getting data from query {i+1}/{len(urls)}...")
|
| 638 |
-
|
| 639 |
-
if reduce_sample_checkbox and sample_reduction_method == "First n samples" and records_per_query >= target_size:
|
| 640 |
-
print(f"Reached target size: {records_per_query}/{target_size}, breaking from download")
|
| 641 |
-
should_break_current_query = True
|
| 642 |
-
break
|
| 643 |
-
# If we get here without an exception, break the retry loop
|
| 644 |
-
break
|
| 645 |
-
except Exception as e:
|
| 646 |
-
print(f"Error processing page: {e}")
|
| 647 |
-
if retry_attempt < max_retries - 1:
|
| 648 |
-
wait_time = base_wait_time * (exponent ** retry_attempt) + random.random()
|
| 649 |
-
print(f"Retrying in {wait_time:.2f} seconds (attempt {retry_attempt + 1}/{max_retries})...")
|
| 650 |
-
time.sleep(wait_time)
|
| 651 |
-
else:
|
| 652 |
-
print(f"Maximum retries reached. Continuing with next page.")
|
| 653 |
-
|
| 654 |
-
# Break out of retry loop if we've reached target
|
| 655 |
-
if should_break_current_query:
|
| 656 |
-
break
|
| 657 |
-
|
| 658 |
-
if should_break_current_query:
|
| 659 |
-
print(f"Successfully downloaded target size for query {i+1}, moving to next query")
|
| 660 |
-
# Continue to next query instead of breaking the entire query loop
|
| 661 |
-
continue
|
| 662 |
-
# Continue to next query - don't break out of the main query loop
|
| 663 |
print(f"Query completed in {time.time() - start_time:.2f} seconds")
|
| 664 |
print(f"Total records collected: {len(records)}")
|
| 665 |
-
print(f"Expected to download: {expected_download_count}")
|
| 666 |
-
print(f"Available from all queries: {total_query_length}")
|
| 667 |
print(f"Sample method used: {sample_reduction_method}")
|
| 668 |
print(f"Reduce sample enabled: {reduce_sample_checkbox}")
|
| 669 |
if sample_reduction_method == "n random samples":
|
| 670 |
print(f"Seed value: {seed_value}")
|
| 671 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 672 |
# Process records
|
| 673 |
processing_start = time.time()
|
| 674 |
records_df = process_records_to_df(records)
|
|
@@ -678,7 +530,7 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
|
|
| 678 |
|
| 679 |
|
| 680 |
if reduce_sample_checkbox and sample_reduction_method != "All" and sample_reduction_method != "n random samples":
|
| 681 |
-
#
|
| 682 |
sample_size = min(sample_size_slider, len(records_df))
|
| 683 |
|
| 684 |
# Check if we have multiple queries for sampling logic
|
|
|
|
| 5 |
# Standard library imports
|
| 6 |
|
| 7 |
import os
|
| 8 |
+
from config_loader import load_local_config
|
| 9 |
+
|
| 10 |
+
load_local_config()
|
| 11 |
|
| 12 |
#Enforce local caching:
|
| 13 |
|
|
|
|
| 81 |
import matplotlib.colors as mcolors
|
| 82 |
from matplotlib.colors import Normalize
|
| 83 |
|
|
|
|
|
|
|
| 84 |
import opinionated # for fonts
|
| 85 |
plt.style.use("opinionated_rc")
|
| 86 |
|
|
|
|
| 160 |
#print(f"Spaces version: {spaces.__version__}")
|
| 161 |
|
| 162 |
import datamapplot
|
|
|
|
| 163 |
|
| 164 |
# Local imports
|
| 165 |
+
from openalex_client import get_openalex_client, normalize_openalex_url
|
| 166 |
from openalex_utils import (
|
|
|
|
| 167 |
get_field,
|
| 168 |
process_records_to_df,
|
| 169 |
openalex_url_to_filename,
|
|
|
|
| 195 |
|
| 196 |
|
| 197 |
# Configure OpenAlex
|
| 198 |
+
openalex_client = get_openalex_client(require_api_key=is_running_in_hf_space())
|
| 199 |
|
| 200 |
print(f"Imports completed: {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
| 201 |
|
|
|
|
| 466 |
urls = [url.strip() for url in text_input.split(';')]
|
| 467 |
records = []
|
| 468 |
query_indices = [] # Track which query each record comes from
|
|
|
|
|
|
|
| 469 |
|
| 470 |
# Use first URL for filename
|
|
|
|
| 471 |
filename = openalex_url_to_filename(urls[0])
|
| 472 |
print(f"Filename: {filename}")
|
| 473 |
|
| 474 |
# Process each URL
|
| 475 |
+
try:
|
| 476 |
+
for i, url in enumerate(urls):
|
| 477 |
+
query = normalize_openalex_url(url)
|
| 478 |
+
if query.entity != "works":
|
| 479 |
+
raise ValueError("Only OpenAlex work queries are supported.")
|
| 480 |
+
|
| 481 |
+
progress_base = 0.1 + (0.2 * i / max(1, len(urls)))
|
| 482 |
+
|
| 483 |
+
if reduce_sample_checkbox and sample_reduction_method == "n random samples":
|
| 484 |
+
try:
|
| 485 |
+
seed_int = int(seed_value) if seed_value.strip() else 42
|
| 486 |
+
except ValueError:
|
| 487 |
+
seed_int = 42
|
| 488 |
+
print(f"Invalid seed value '{seed_value}', using default: 42")
|
| 489 |
+
|
| 490 |
+
progress(progress_base, desc=f"Sampling query {i+1}/{len(urls)}...")
|
| 491 |
+
query_records = openalex_client.fetch_sampled_works(
|
| 492 |
+
query,
|
| 493 |
+
sample_size=sample_size_slider,
|
| 494 |
+
seed=seed_int,
|
| 495 |
+
)
|
| 496 |
+
print(f"Query {i+1}: sampled {len(query_records)} records (seed={seed_int})")
|
| 497 |
+
else:
|
| 498 |
+
target_size = sample_size_slider if reduce_sample_checkbox and sample_reduction_method == "First n samples" else None
|
| 499 |
+
query_desc = f"Downloading query {i+1}/{len(urls)}..."
|
| 500 |
+
if target_size is not None:
|
| 501 |
+
query_desc = f"Downloading first {target_size} records from query {i+1}/{len(urls)}..."
|
| 502 |
+
progress(progress_base, desc=query_desc)
|
| 503 |
+
query_records = openalex_client.fetch_works(query, limit=target_size)
|
| 504 |
+
print(f"Query {i+1}: fetched {len(query_records)} records")
|
| 505 |
+
|
| 506 |
+
records.extend(query_records)
|
| 507 |
+
query_indices.extend([i] * len(query_records))
|
| 508 |
+
progress(0.1 + (0.2 * (i + 1) / max(1, len(urls))), desc=f"Finished query {i+1}/{len(urls)}")
|
| 509 |
+
except Exception as e:
|
| 510 |
+
error_message = f"Error downloading data from OpenAlex: {str(e)}"
|
| 511 |
+
return create_error_response(error_message)
|
| 512 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 513 |
print(f"Query completed in {time.time() - start_time:.2f} seconds")
|
| 514 |
print(f"Total records collected: {len(records)}")
|
|
|
|
|
|
|
| 515 |
print(f"Sample method used: {sample_reduction_method}")
|
| 516 |
print(f"Reduce sample enabled: {reduce_sample_checkbox}")
|
| 517 |
if sample_reduction_method == "n random samples":
|
| 518 |
print(f"Seed value: {seed_value}")
|
| 519 |
|
| 520 |
+
if not records:
|
| 521 |
+
error_message = "Error: OpenAlex returned no records for the provided query."
|
| 522 |
+
return create_error_response(error_message)
|
| 523 |
+
|
| 524 |
# Process records
|
| 525 |
processing_start = time.time()
|
| 526 |
records_df = process_records_to_df(records)
|
|
|
|
| 530 |
|
| 531 |
|
| 532 |
if reduce_sample_checkbox and sample_reduction_method != "All" and sample_reduction_method != "n random samples":
|
| 533 |
+
# Random sampling is already handled in the OpenAlex fetch layer above.
|
| 534 |
sample_size = min(sample_size_slider, len(records_df))
|
| 535 |
|
| 536 |
# Check if we have multiple queries for sampling logic
|
config_loader.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
from functools import lru_cache
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
DEFAULT_CONFIG_FILES = (
|
| 8 |
+
"openalex_config.local.json",
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@lru_cache(maxsize=1)
def load_local_config():
    """Populate ``os.environ`` from the first local JSON config file found.

    The file named by ``OPENALEX_CONFIG_FILE`` is used when that variable is
    set; otherwise the names in ``DEFAULT_CONFIG_FILES`` are tried in order.
    Only the first existing candidate is read. Keys already present in the
    environment and keys whose value is ``None`` are left alone.

    Returns:
        The parsed JSON object, or an empty dict when no candidate exists.

    Raises:
        ValueError: If the file's top-level JSON value is not an object.
    """
    override = os.environ.get("OPENALEX_CONFIG_FILE")
    candidates = [override] if override else list(DEFAULT_CONFIG_FILES)

    config_path = next(
        (path for path in map(Path, candidates) if path.exists()),
        None,
    )
    if config_path is None:
        return {}

    with config_path.open("r", encoding="utf-8") as stream:
        settings = json.load(stream)

    if not isinstance(settings, dict):
        raise ValueError(f"Config file {config_path} must contain a JSON object at the top level.")

    for name, raw_value in settings.items():
        # Real environment variables always win over the local file.
        if raw_value is None or name in os.environ:
            continue
        os.environ[name] = str(raw_value)

    return settings
|
openalex_client.py
ADDED
|
@@ -0,0 +1,369 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import random
|
| 3 |
+
import re
|
| 4 |
+
import time
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from functools import lru_cache
|
| 7 |
+
from typing import Iterable
|
| 8 |
+
from urllib.parse import parse_qs, urlparse
|
| 9 |
+
|
| 10 |
+
import requests
|
| 11 |
+
|
| 12 |
+
from config_loader import load_local_config
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
DEFAULT_BASE_URL = "https://api.openalex.org"
|
| 16 |
+
DEFAULT_PER_PAGE = 100
|
| 17 |
+
DEFAULT_TIMEOUT = 30
|
| 18 |
+
DEFAULT_RETRIES = 5
|
| 19 |
+
DEFAULT_SELECT_FIELDS = (
|
| 20 |
+
"id",
|
| 21 |
+
"title",
|
| 22 |
+
"display_name",
|
| 23 |
+
"doi",
|
| 24 |
+
"publication_year",
|
| 25 |
+
"abstract_inverted_index",
|
| 26 |
+
"primary_location",
|
| 27 |
+
"primary_topic",
|
| 28 |
+
"referenced_works",
|
| 29 |
+
)
|
| 30 |
+
IGNORED_INPUT_PARAMS = {
|
| 31 |
+
"api_key",
|
| 32 |
+
"cursor",
|
| 33 |
+
"page",
|
| 34 |
+
"per-page",
|
| 35 |
+
"per_page",
|
| 36 |
+
"sample",
|
| 37 |
+
"seed",
|
| 38 |
+
"select",
|
| 39 |
+
}
|
| 40 |
+
OPENALEX_ID_RE = re.compile(r"^[A-Za-z]\d+$")
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@dataclass(frozen=True)
class FilterToken:
    """One ``key:value`` clause of an OpenAlex ``filter`` parameter."""

    # Filter attribute name, e.g. "primary_location.source.id".
    key: str
    # Raw filter value exactly as it should be sent after the colon.
    value: str
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
@dataclass
class OpenAlexQuery:
    """A parsed OpenAlex list query: entity, plain parameters and filter clauses."""

    # API collection to query, e.g. "works".
    entity: str = "works"
    # Non-filter query parameters (search, sort, ...).
    params: dict[str, str] = field(default_factory=dict)
    # Filter clauses, joined with commas when the request is built.
    filter_tokens: list[FilterToken] = field(default_factory=list)
    # Original names of filter keys that were rewritten during normalization.
    legacy_filters: list[str] = field(default_factory=list)

    def as_params(self, select_fields: Iterable[str] | None = None, extra_params: dict[str, str] | None = None):
        """Build the request parameter dict for this query.

        Args:
            select_fields: Field names to send as a comma-joined ``select``.
            extra_params: Additional parameters; values are stringified and
                override same-named entries.

        Returns:
            A new dict; ``self.params`` is not modified.
        """
        request_params = {**self.params}

        if self.filter_tokens:
            clauses = [f"{token.key}:{token.value}" for token in self.filter_tokens]
            request_params["filter"] = ",".join(clauses)

        if select_fields:
            request_params["select"] = ",".join(select_fields)

        if extra_params:
            for name, raw_value in extra_params.items():
                request_params[name] = str(raw_value)

        return request_params

    def without_params(self, *keys: str):
        """Return a copy of this query with the given parameter names removed."""
        dropped = set(keys)
        remaining = {name: value for name, value in self.params.items() if name not in dropped}
        return OpenAlexQuery(
            entity=self.entity,
            params=remaining,
            filter_tokens=[*self.filter_tokens],
            legacy_filters=[*self.legacy_filters],
        )
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _normalize_url_input(url: str):
    """Trim the input and add ``https://`` to scheme-less OpenAlex URLs.

    Only inputs starting with ``openalex.org/`` or ``api.openalex.org/`` and
    containing no ``://`` are prefixed; anything else is returned trimmed.
    """
    cleaned = url.strip()
    has_scheme = "://" in cleaned
    is_bare_openalex = cleaned.startswith(("openalex.org/", "api.openalex.org/"))
    if is_bare_openalex and not has_scheme:
        return "https://" + cleaned
    return cleaned
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def _split_filter_string(filter_value: str):
    """Split a ``filter`` value on commas that are not inside quotes.

    Single and double quotes both open a quoted span, closed only by the same
    quote character. Quote characters are kept in the output. Each piece is
    stripped of surrounding whitespace and empty pieces are dropped.
    """
    pieces = []
    buffer = []
    open_quote = None

    def flush():
        # Emit the accumulated characters as one token, skipping blanks.
        text = "".join(buffer).strip()
        if text:
            pieces.append(text)
        buffer.clear()

    for ch in filter_value:
        if ch == "," and open_quote is None:
            flush()
            continue

        if ch in {"'", '"'}:
            if open_quote is None:
                open_quote = ch
            elif open_quote == ch:
                open_quote = None
            # A different quote character inside an open span is plain text.

        buffer.append(ch)

    flush()
    return pieces
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def _normalize_filter_token(key: str, value: str):
    """Rewrite legacy filter keys to their current names.

    Returns:
        A ``(key, value, legacy_key)`` tuple. ``legacy_key`` is the original
        key when a rewrite happened, otherwise ``None``. ``default.search`` is
        mapped to the sentinel key ``"__search__"``.
    """
    if key == "default.search":
        return "__search__", value, key

    # (legacy prefix, replacement prefix); the first matching prefix wins.
    prefix_renames = (
        ("host_venue.", "primary_location.source."),
        ("alternate_host_venues.", "locations.source."),
        ("x_concepts", "concepts"),
    )
    for legacy_prefix, current_prefix in prefix_renames:
        if key.startswith(legacy_prefix):
            return current_prefix + key[len(legacy_prefix):], value, key

    return key, value, None
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def normalize_openalex_url(url: str):
    """Parse an OpenAlex URL into an ``OpenAlexQuery``.

    The first path segment becomes the entity (``"works"`` when the path is
    empty). Every ``filter`` parameter is split into tokens and legacy keys
    are rewritten. A ``default.search`` filter becomes the ``search``
    parameter unless the URL already has one, in which case it stays a filter.
    Remaining parameters are copied through, except those listed in
    ``IGNORED_INPUT_PARAMS``.
    """
    parsed = urlparse(_normalize_url_input(url))
    segments = [segment for segment in parsed.path.split("/") if segment]
    entity = segments[0] if segments else "works"

    raw_params = parse_qs(parsed.query, keep_blank_values=True)
    search_value = raw_params.get("search", [None])[0]

    legacy_filters = []
    filter_tokens = []
    for filter_string in raw_params.get("filter", []):
        for piece in _split_filter_string(filter_string):
            raw_key, colon, raw_value = piece.partition(":")
            if not colon:
                # Pieces without a colon are not key:value clauses; skip them.
                continue

            key, value, legacy_key = _normalize_filter_token(raw_key.strip(), raw_value.strip())
            if legacy_key:
                legacy_filters.append(legacy_key)

            if key != "__search__":
                filter_tokens.append(FilterToken(key, value))
            elif search_value is None:
                search_value = value
            else:
                # An explicit search already exists; keep this one as a filter.
                filter_tokens.append(FilterToken("default.search", value))

    params = {}
    if search_value:
        params["search"] = search_value

    for name, values in raw_params.items():
        if not values or name in {"filter", "search"} or name in IGNORED_INPUT_PARAMS:
            continue
        # Only reachable for "per-page" if it is not in IGNORED_INPUT_PARAMS.
        params["per_page" if name == "per-page" else name] = values[0]

    return OpenAlexQuery(
        entity=entity,
        params=params,
        filter_tokens=filter_tokens,
        legacy_filters=legacy_filters,
    )
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def _normalize_openalex_id(entity_id: str):
    """Reduce an OpenAlex ID or ``https://openalex.org/...`` URL to a short ID.

    For such URLs only the last path segment is kept. When the result matches
    ``OPENALEX_ID_RE`` its leading letter is upper-cased; any other input is
    returned stripped but otherwise unchanged.
    """
    candidate = entity_id.strip()
    if candidate.startswith("https://openalex.org/"):
        candidate = candidate.rstrip("/").rsplit("/", 1)[-1]

    if not OPENALEX_ID_RE.match(candidate):
        return candidate
    return candidate[:1].upper() + candidate[1:]
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
class OpenAlexClient:
|
| 197 |
+
def __init__(
|
| 198 |
+
self,
|
| 199 |
+
api_key=None,
|
| 200 |
+
base_url=DEFAULT_BASE_URL,
|
| 201 |
+
timeout=DEFAULT_TIMEOUT,
|
| 202 |
+
max_retries=DEFAULT_RETRIES,
|
| 203 |
+
):
|
| 204 |
+
self.api_key = api_key
|
| 205 |
+
self.base_url = base_url.rstrip("/")
|
| 206 |
+
self.timeout = timeout
|
| 207 |
+
self.max_retries = max_retries
|
| 208 |
+
self.session = requests.Session()
|
| 209 |
+
self.session.headers.update(
|
| 210 |
+
{
|
| 211 |
+
"User-Agent": "OpenAlexMapper/1.0 (+https://huggingface.co/spaces/m7n/openalex_mapper)",
|
| 212 |
+
"Accept": "application/json",
|
| 213 |
+
}
|
| 214 |
+
)
|
| 215 |
+
self._entity_cache = {}
|
| 216 |
+
|
| 217 |
+
@classmethod
|
| 218 |
+
def from_env(cls, require_api_key=False):
|
| 219 |
+
load_local_config()
|
| 220 |
+
api_key = os.environ.get("OPENALEX_API_KEY")
|
| 221 |
+
if api_key:
|
| 222 |
+
api_key = api_key.strip()
|
| 223 |
+
if require_api_key and not api_key:
|
| 224 |
+
raise RuntimeError(
|
| 225 |
+
"OPENALEX_API_KEY is required. Set it as a Hugging Face Space secret or in openalex_config.local.json."
|
| 226 |
+
)
|
| 227 |
+
return cls(api_key=api_key or None)
|
| 228 |
+
|
| 229 |
+
def _request_json(self, path, params=None):
|
| 230 |
+
params = dict(params or {})
|
| 231 |
+
if self.api_key:
|
| 232 |
+
params["api_key"] = self.api_key
|
| 233 |
+
|
| 234 |
+
url = f"{self.base_url}/{path.lstrip('/')}"
|
| 235 |
+
last_error = None
|
| 236 |
+
|
| 237 |
+
for attempt in range(self.max_retries):
|
| 238 |
+
response = None
|
| 239 |
+
try:
|
| 240 |
+
response = self.session.get(url, params=params, timeout=self.timeout)
|
| 241 |
+
if response.status_code in {429, 500, 502, 503, 504}:
|
| 242 |
+
retry_after = response.headers.get("Retry-After")
|
| 243 |
+
wait_time = float(retry_after) if retry_after else (2 ** attempt)
|
| 244 |
+
time.sleep(wait_time)
|
| 245 |
+
continue
|
| 246 |
+
response.raise_for_status()
|
| 247 |
+
return response.json()
|
| 248 |
+
except requests.RequestException as exc:
|
| 249 |
+
last_error = exc
|
| 250 |
+
if attempt == self.max_retries - 1:
|
| 251 |
+
break
|
| 252 |
+
time.sleep(2 ** attempt)
|
| 253 |
+
|
| 254 |
+
if response is not None:
|
| 255 |
+
try:
|
| 256 |
+
payload = response.json()
|
| 257 |
+
except ValueError:
|
| 258 |
+
payload = response.text
|
| 259 |
+
raise RuntimeError(f"OpenAlex request failed for {url}: {payload}") from last_error
|
| 260 |
+
raise RuntimeError(f"OpenAlex request failed for {url}: {last_error}") from last_error
|
| 261 |
+
|
| 262 |
+
def get_entity(self, entity, entity_id, select_fields=None):
|
| 263 |
+
normalized_id = _normalize_openalex_id(entity_id)
|
| 264 |
+
cache_key = (entity, normalized_id, tuple(select_fields or ()))
|
| 265 |
+
if cache_key in self._entity_cache:
|
| 266 |
+
return self._entity_cache[cache_key]
|
| 267 |
+
|
| 268 |
+
payload = self._request_json(
|
| 269 |
+
f"{entity}/{normalized_id}",
|
| 270 |
+
params={"select": ",".join(select_fields)} if select_fields else None,
|
| 271 |
+
)
|
| 272 |
+
self._entity_cache[cache_key] = payload
|
| 273 |
+
return payload
|
| 274 |
+
|
| 275 |
+
def count(self, query):
|
| 276 |
+
payload = self._request_json(
|
| 277 |
+
query.entity,
|
| 278 |
+
params=query.as_params(select_fields=("id",), extra_params={"per_page": 1}),
|
| 279 |
+
)
|
| 280 |
+
return int(payload.get("meta", {}).get("count") or 0)
|
| 281 |
+
|
| 282 |
+
def _normalize_work_record(self, record):
|
| 283 |
+
normalized = dict(record)
|
| 284 |
+
normalized["title"] = normalized.get("title") or normalized.get("display_name") or " "
|
| 285 |
+
normalized.setdefault("abstract_inverted_index", None)
|
| 286 |
+
normalized.setdefault("primary_location", None)
|
| 287 |
+
normalized.setdefault("primary_topic", None)
|
| 288 |
+
normalized.setdefault("referenced_works", [])
|
| 289 |
+
return normalized
|
| 290 |
+
|
| 291 |
+
def iter_works(self, query, limit=None, extra_params=None, per_page=DEFAULT_PER_PAGE):
|
| 292 |
+
params = query.as_params(select_fields=DEFAULT_SELECT_FIELDS, extra_params=extra_params)
|
| 293 |
+
params["cursor"] = params.get("cursor", "*")
|
| 294 |
+
|
| 295 |
+
fetched = 0
|
| 296 |
+
while True:
|
| 297 |
+
current_per_page = per_page
|
| 298 |
+
if limit is not None:
|
| 299 |
+
remaining = limit - fetched
|
| 300 |
+
if remaining <= 0:
|
| 301 |
+
break
|
| 302 |
+
current_per_page = min(current_per_page, remaining)
|
| 303 |
+
params["per_page"] = current_per_page
|
| 304 |
+
|
| 305 |
+
payload = self._request_json(query.entity, params=params)
|
| 306 |
+
results = payload.get("results", [])
|
| 307 |
+
if not results:
|
| 308 |
+
break
|
| 309 |
+
|
| 310 |
+
for record in results:
|
| 311 |
+
yield self._normalize_work_record(record)
|
| 312 |
+
fetched += 1
|
| 313 |
+
if limit is not None and fetched >= limit:
|
| 314 |
+
return
|
| 315 |
+
|
| 316 |
+
next_cursor = payload.get("meta", {}).get("next_cursor")
|
| 317 |
+
if next_cursor is None:
|
| 318 |
+
break
|
| 319 |
+
params["cursor"] = next_cursor
|
| 320 |
+
|
| 321 |
+
def fetch_works(self, query, limit=None):
|
| 322 |
+
return list(self.iter_works(query, limit=limit))
|
| 323 |
+
|
| 324 |
+
def fetch_sampled_works(self, query, sample_size, seed):
|
| 325 |
+
sampling_query = query.without_params("sort")
|
| 326 |
+
if sample_size <= 10000:
|
| 327 |
+
return list(
|
| 328 |
+
self.iter_works(
|
| 329 |
+
sampling_query,
|
| 330 |
+
limit=sample_size,
|
| 331 |
+
extra_params={"sample": sample_size, "seed": seed},
|
| 332 |
+
)
|
| 333 |
+
)
|
| 334 |
+
return self.reservoir_sample_works(sampling_query, sample_size, seed)
|
| 335 |
+
|
| 336 |
+
def reservoir_sample_works(self, query, sample_size, seed):
    """Draw ``sample_size`` records from ``iter_works(query)`` in one pass.

    The first ``sample_size`` records fill the reservoir; each later record
    at zero-based position ``index`` replaces a uniformly chosen slot with
    probability ``sample_size / (index + 1)``.

    Args:
        query: Query object forwarded to ``iter_works``.
        sample_size: Number of records to keep.
        seed: Seed for the private ``random.Random`` generator, so the same
            stream and seed give the same sample.

    Returns:
        A list of at most ``sample_size`` records; all records when the
        stream is shorter than ``sample_size``; ``[]`` when ``sample_size``
        is not positive.
    """
    if sample_size <= 0:
        # Nothing can be kept, so do not walk the whole result stream.
        return []

    rng = random.Random(seed)
    reservoir = []

    for index, record in enumerate(self.iter_works(query)):
        if index < sample_size:
            reservoir.append(record)
            continue

        # randint is inclusive on both ends: a slot in [0, index].
        sample_index = rng.randint(0, index)
        if sample_index < sample_size:
            reservoir[sample_index] = record

    return reservoir
|
| 350 |
+
|
| 351 |
+
def fetch_records_from_dois(self, doi_list, block_size=50):
    """Fetch work records for a list of DOIs, ``block_size`` DOIs per query.

    Entries that are not strings, or that are empty after stripping
    whitespace, are ignored. Each block is sent as one ``doi`` filter whose
    values are joined with ``|``.

    Args:
        doi_list: Iterable of DOI strings.
        block_size: Number of DOIs combined into one request; must be >= 1.
            NOTE(review): the API limits how many values one OR-filter may
            hold - confirm the upper bound before raising the default.

    Returns:
        A list with the records of all blocks, in block order.

    Raises:
        ValueError: If ``block_size`` is smaller than 1.
    """
    if block_size < 1:
        # A negative step would make the range below empty and silently
        # return no records at all.
        raise ValueError("block_size must be a positive integer")

    clean_dois = [doi.strip() for doi in doi_list if isinstance(doi, str) and doi.strip()]

    all_records = []
    for start in range(0, len(clean_dois), block_size):
        sublist = clean_dois[start : start + block_size]
        query = OpenAlexQuery(
            entity="works",
            filter_tokens=[FilterToken("doi", "|".join(sublist))],
        )
        # A block cannot match more works than it has DOIs.
        all_records.extend(self.fetch_works(query, limit=len(sublist)))

    return all_records
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
@lru_cache(maxsize=2)
def _cached_openalex_client(require_api_key):
    """Build the client via ``OpenAlexClient.from_env`` once per flag value."""
    return OpenAlexClient.from_env(require_api_key=require_api_key)


def get_openalex_client(require_api_key=False):
    """Return the shared ``OpenAlexClient`` instance.

    ``lru_cache`` keys positional and keyword calls separately, so caching
    this function directly would treat ``f()``, ``f(False)`` and
    ``f(require_api_key=False)`` as different entries and rebuild the
    client. The flag is normalized to one positional ``bool`` before the
    cached factory is consulted.

    Args:
        require_api_key: Forwarded to ``OpenAlexClient.from_env``.

    Returns:
        The cached client for the given flag value.
    """
    return _cached_openalex_client(bool(require_api_key))


# Keep the cache-management hooks the lru_cache-wrapped function exposed.
get_openalex_client.cache_clear = _cached_openalex_client.cache_clear
get_openalex_client.cache_info = _cached_openalex_client.cache_info
|
openalex_config.example.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"OPENALEX_API_KEY": "replace-with-your-openalex-api-key"
|
| 3 |
+
}
|
openalex_utils.py
CHANGED
|
@@ -1,387 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import numpy as np
|
| 2 |
-
from urllib.parse import urlparse, parse_qs
|
| 3 |
-
from pyalex import Works, Authors, Institutions
|
| 4 |
import pandas as pd
|
| 5 |
-
import ast, json
|
| 6 |
-
|
| 7 |
-
def openalex_url_to_pyalex_query(url):
    """Convert an OpenAlex search URL to a pyalex query.

    Args:
        url (str): The OpenAlex search URL.

    Returns:
        tuple: (Works object, dict of parameters). The dict holds the raw
        string values of ``page``, ``per-page``, ``sample`` and ``seed``
        when they are present in the URL.
    """
    query_params = parse_qs(urlparse(url).query)

    query = Works()

    # Filters: comma-separated "key:value" pairs; pairs without ":" are ignored.
    if 'filter' in query_params:
        for f in query_params['filter'][0].split(','):
            if ':' not in f:
                continue
            key, value = f.split(':', 1)
            if key == 'default.search':
                query = query.search(value)
            else:
                query = query.filter(**{key: value})

    # Sort terms: "field:direction", "-field" (descending) or "field" (ascending).
    if 'sort' in query_params:
        for s in query_params['sort'][0].split(','):
            if not s:
                # Skip empty terms such as the one left by a trailing comma.
                continue
            if ':' in s:
                # Split once so an unexpected extra ":" cannot raise.
                field, direction = s.split(':', 1)
                query = query.sort(**{field: direction})
            elif s.startswith('-'):
                query = query.sort(**{s[1:]: 'desc'})
            else:
                query = query.sort(**{s: 'asc'})

    # Paging and sampling parameters are returned separately, untouched.
    params = {}
    for key in ['page', 'per-page', 'sample', 'seed']:
        if key in query_params:
            params[key] = query_params[key][0]

    return query, params
|
| 53 |
|
| 54 |
-
|
| 55 |
-
|
| 56 |
|
| 57 |
-
|
| 58 |
-
"""
|
| 59 |
-
# Try to coerce a string into a Python object first
|
| 60 |
if isinstance(inv_index, str):
|
| 61 |
try:
|
| 62 |
-
inv_index = json.loads(inv_index)
|
| 63 |
except Exception:
|
| 64 |
try:
|
| 65 |
-
inv_index = ast.literal_eval(inv_index)
|
| 66 |
except Exception:
|
| 67 |
inv_index = None
|
| 68 |
|
| 69 |
if isinstance(inv_index, dict):
|
| 70 |
-
|
| 71 |
-
return " ".join(
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
def get_pub(x):
|
| 77 |
"""Extract publication name from record."""
|
| 78 |
-
try:
|
| 79 |
-
source = x[
|
| 80 |
-
if source not in [
|
| 81 |
return source
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
|
| 87 |
def get_field(x):
|
| 88 |
"""Extract academic field from record."""
|
| 89 |
try:
|
| 90 |
-
field = x[
|
| 91 |
if field is not None:
|
| 92 |
return field
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
except:
|
| 96 |
return np.nan
|
| 97 |
|
|
|
|
| 98 |
def process_records_to_df(records):
|
| 99 |
-
"""
|
| 100 |
-
Convert OpenAlex records to a pandas DataFrame with processed fields.
|
| 101 |
-
Can handle either raw OpenAlex records or an existing DataFrame.
|
| 102 |
-
|
| 103 |
-
Args:
|
| 104 |
-
records (list or pd.DataFrame): List of OpenAlex record dictionaries or existing DataFrame
|
| 105 |
-
|
| 106 |
-
Returns:
|
| 107 |
-
pandas.DataFrame: Processed DataFrame with abstracts, publications, and titles
|
| 108 |
-
"""
|
| 109 |
-
# If records is already a DataFrame, use it directly
|
| 110 |
if isinstance(records, pd.DataFrame):
|
| 111 |
records_df = records.copy()
|
| 112 |
-
# Only process abstract_inverted_index and primary_location if they exist
|
| 113 |
-
if 'abstract_inverted_index' in records_df.columns:
|
| 114 |
-
records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
|
| 115 |
-
if 'primary_location' in records_df.columns:
|
| 116 |
-
records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
|
| 117 |
-
records_df['parsed_publication'] = records_df['parsed_publication'].fillna(' ') # fill missing values with space, only if we have them.
|
| 118 |
-
|
| 119 |
else:
|
| 120 |
-
# Process raw records as before
|
| 121 |
records_df = pd.DataFrame(records)
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
records_df[
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
return records_df
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
def openalex_url_to_filename(url):
|
| 135 |
-
"""
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
Args:
|
| 139 |
-
url (str): The OpenAlex search URL
|
| 140 |
-
|
| 141 |
-
Returns:
|
| 142 |
-
str: A filename-safe string with timestamp (without extension)
|
| 143 |
-
"""
|
| 144 |
-
from datetime import datetime
|
| 145 |
-
import re
|
| 146 |
-
|
| 147 |
-
# First parse the URL into query and params
|
| 148 |
-
parsed_url = urlparse(url)
|
| 149 |
-
query_params = parse_qs(parsed_url.query)
|
| 150 |
-
|
| 151 |
-
# Create parts of the filename
|
| 152 |
parts = []
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
if key == 'default_search':
|
| 170 |
-
parts.append(f"search_{clean_value}")
|
| 171 |
-
else:
|
| 172 |
-
parts.append(f"{key}_{clean_value}")
|
| 173 |
-
|
| 174 |
-
# Handle sort parameters
|
| 175 |
-
if 'sort' in query_params:
|
| 176 |
-
sort_params = query_params['sort'][0].split(',')
|
| 177 |
-
for s in sort_params:
|
| 178 |
-
if s.startswith('-'):
|
| 179 |
-
parts.append(f"sort_{s[1:].replace('.', '_')}_desc")
|
| 180 |
else:
|
| 181 |
-
parts.append(f"sort_{
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
# Combine all parts
|
| 187 |
-
filename = '__'.join(parts) if parts else 'openalex_query'
|
| 188 |
filename = f"{filename}__{timestamp}"
|
| 189 |
-
|
| 190 |
-
# Ensure filename is not too long (max 255 chars is common filesystem limit)
|
| 191 |
if len(filename) > 255:
|
| 192 |
-
filename = filename[:251]
|
| 193 |
-
|
| 194 |
return filename
|
| 195 |
|
|
|
|
| 196 |
def get_records_from_dois(doi_list, block_size=50):
|
| 197 |
-
"""
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
|
|
|
|
|
|
| 217 |
|
| 218 |
def openalex_url_to_readable_name(url):
|
| 219 |
-
"""
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
Args:
|
| 223 |
-
url (str): The OpenAlex search URL
|
| 224 |
-
|
| 225 |
-
Returns:
|
| 226 |
-
str: A short, human-readable description of the query
|
| 227 |
-
|
| 228 |
-
Examples:
|
| 229 |
-
- "Search: 'Kuramoto Model'"
|
| 230 |
-
- "Search: 'quantum physics', 2020-2023"
|
| 231 |
-
- "Cites: Popper (1959)"
|
| 232 |
-
- "From: University of Pittsburgh, 1999-2020"
|
| 233 |
-
- "By: Einstein, A., 1905-1955"
|
| 234 |
-
"""
|
| 235 |
-
import re
|
| 236 |
-
|
| 237 |
-
# Parse the URL
|
| 238 |
-
parsed_url = urlparse(url)
|
| 239 |
-
query_params = parse_qs(parsed_url.query)
|
| 240 |
-
|
| 241 |
-
# Initialize description parts
|
| 242 |
parts = []
|
| 243 |
year_range = None
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
key
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
year_range = f"{start_year}-{end_year}"
|
| 272 |
-
else:
|
| 273 |
-
year_range = value
|
| 274 |
-
|
| 275 |
-
elif key == 'cites':
|
| 276 |
-
# Look up the cited work to get author and year
|
| 277 |
-
work_id = value
|
| 278 |
-
try:
|
| 279 |
-
cited_work = Works()[work_id]
|
| 280 |
-
if cited_work:
|
| 281 |
-
# Get first author's last name
|
| 282 |
-
author_name = "Unknown"
|
| 283 |
-
year = "Unknown"
|
| 284 |
-
|
| 285 |
-
if cited_work.get('authorships') and len(cited_work['authorships']) > 0:
|
| 286 |
-
first_author = cited_work['authorships'][0]['author']
|
| 287 |
-
if first_author.get('display_name'):
|
| 288 |
-
# Extract last name (assuming "First Last" format)
|
| 289 |
-
name_parts = first_author['display_name'].split()
|
| 290 |
-
author_name = name_parts[-1] if name_parts else first_author['display_name']
|
| 291 |
-
|
| 292 |
-
if cited_work.get('publication_year'):
|
| 293 |
-
year = str(cited_work['publication_year'])
|
| 294 |
-
|
| 295 |
-
parts.append(f"Cites: {author_name} ({year})")
|
| 296 |
-
else:
|
| 297 |
-
parts.append(f"Cites: Work {work_id}")
|
| 298 |
-
except Exception as e:
|
| 299 |
-
print(f"Could not fetch cited work {work_id}: {e}")
|
| 300 |
-
parts.append(f"Cites: Work {work_id}")
|
| 301 |
-
|
| 302 |
-
elif key == 'authorships.institutions.lineage':
|
| 303 |
-
# Look up institution name
|
| 304 |
-
inst_id = value
|
| 305 |
-
try:
|
| 306 |
-
institution = Institutions()[inst_id]
|
| 307 |
-
if institution and institution.get('display_name'):
|
| 308 |
-
parts.append(f"From: {institution['display_name']}")
|
| 309 |
-
else:
|
| 310 |
-
parts.append(f"From: Institution {inst_id}")
|
| 311 |
-
except Exception as e:
|
| 312 |
-
print(f"Could not fetch institution {inst_id}: {e}")
|
| 313 |
-
parts.append(f"From: Institution {inst_id}")
|
| 314 |
-
|
| 315 |
-
elif key == 'authorships.author.id':
|
| 316 |
-
# Look up author name
|
| 317 |
-
author_id = value
|
| 318 |
-
try:
|
| 319 |
-
author = Authors()[author_id]
|
| 320 |
-
if author and author.get('display_name'):
|
| 321 |
-
parts.append(f"By: {author['display_name']}")
|
| 322 |
-
else:
|
| 323 |
-
parts.append(f"By: Author {author_id}")
|
| 324 |
-
except Exception as e:
|
| 325 |
-
print(f"Could not fetch author {author_id}: {e}")
|
| 326 |
-
parts.append(f"By: Author {author_id}")
|
| 327 |
-
|
| 328 |
-
elif key == 'type':
|
| 329 |
-
# Handle work types
|
| 330 |
-
type_mapping = {
|
| 331 |
-
'article': 'Articles',
|
| 332 |
-
'book': 'Books',
|
| 333 |
-
'book-chapter': 'Book Chapters',
|
| 334 |
-
'dissertation': 'Dissertations',
|
| 335 |
-
'preprint': 'Preprints'
|
| 336 |
-
}
|
| 337 |
-
work_type = type_mapping.get(value, value.replace('-', ' ').title())
|
| 338 |
-
parts.append(f"Type: {work_type}")
|
| 339 |
-
|
| 340 |
-
elif key == 'host_venue.id':
|
| 341 |
-
# Look up venue name
|
| 342 |
-
venue_id = value
|
| 343 |
-
try:
|
| 344 |
-
# For venues, we can use Works to get source info, but let's try a direct approach
|
| 345 |
-
# This might need adjustment based on pyalex API structure
|
| 346 |
-
parts.append(f"In: Venue {venue_id}") # Fallback
|
| 347 |
-
except Exception as e:
|
| 348 |
-
parts.append(f"In: Venue {venue_id}")
|
| 349 |
-
|
| 350 |
-
elif key.startswith('concepts.id'):
|
| 351 |
-
# Handle concept filters - these are topic/concept IDs
|
| 352 |
-
concept_id = value
|
| 353 |
-
parts.append(f"Topic: {concept_id}") # Could be enhanced with concept lookup
|
| 354 |
-
|
| 355 |
else:
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
if year_range:
|
| 378 |
-
if parts
|
| 379 |
-
description += f", {year_range}"
|
| 380 |
-
else:
|
| 381 |
-
description = f"Works from {year_range}"
|
| 382 |
-
|
| 383 |
-
# Limit length to keep it readable
|
| 384 |
if len(description) > 60:
|
| 385 |
description = description[:57] + "..."
|
| 386 |
-
|
| 387 |
-
return description
|
|
|
|
| 1 |
+
import ast
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
|
| 6 |
import numpy as np
|
|
|
|
|
|
|
| 7 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
+
from openalex_client import get_openalex_client, normalize_openalex_url
|
| 10 |
+
|
| 11 |
|
| 12 |
+
def invert_abstract(inv_index):
    """Reconstruct abstract text from an OpenAlex inverted index.

    Args:
        inv_index: Mapping of word -> list of positions, or a string holding
            such a mapping as JSON or as a Python literal. Anything else is
            treated as "no abstract".

    Returns:
        The words joined by single spaces in position order, or ``" "`` when
        no text can be recovered.
    """
    if isinstance(inv_index, str):
        try:
            inv_index = json.loads(inv_index)
        except Exception:
            # Not JSON; fall back to a Python-literal repr of the dict.
            try:
                inv_index = ast.literal_eval(inv_index)
            except Exception:
                inv_index = None

    if not isinstance(inv_index, dict):
        return " "

    placed = []
    for word, positions in inv_index.items():
        # Skip malformed entries instead of letting one bad record raise.
        if not isinstance(word, str) or not isinstance(positions, (list, tuple)):
            continue
        placed.extend((word, pos) for pos in positions if isinstance(pos, int))

    # The sort is stable, so equal positions keep the mapping's order.
    placed.sort(key=lambda item: item[1])
    text = " ".join(word for word, _ in placed)
    # Use the same placeholder as every other "no abstract" path.
    return text if text else " "
|
| 27 |
+
|
| 28 |
+
|
|
|
|
| 29 |
def get_pub(x):
    """Return the source display name of a primary-location record.

    Yields ``" "`` when the name cannot be read from ``x`` or when it is one
    of the placeholder names ``"parsed_publication"`` / ``"Deleted Journal"``.
    """
    placeholder_names = ["parsed_publication", "Deleted Journal"]
    try:
        name = x["source"]["display_name"]
        return " " if name in placeholder_names else name
    except Exception:
        return " "
|
| 38 |
+
|
| 39 |
|
| 40 |
def get_field(x):
    """Return the subfield display name of a record's primary topic.

    Reads ``x["primary_topic"]["subfield"]["display_name"]`` and returns
    ``np.nan`` when that path cannot be read or holds ``None``.
    """
    try:
        subfield_name = x["primary_topic"]["subfield"]["display_name"]
    except Exception:
        return np.nan
    return np.nan if subfield_name is None else subfield_name
|
| 49 |
|
| 50 |
+
|
| 51 |
def process_records_to_df(records):
    """Convert OpenAlex records to a pandas DataFrame with the expected mapper fields.

    Args:
        records: A list of record dicts, or an existing DataFrame (which is
            copied, not modified).

    Returns:
        A DataFrame with a fresh 0..n-1 index that always has the columns
        ``title``, ``abstract`` and ``parsed_publication``. Missing values in
        those columns are replaced by ``" "``. Existing columns of those
        names are kept; otherwise they are derived from ``display_name``,
        ``abstract_inverted_index`` and ``primary_location`` when available.
        Rows repeating an already-seen ``id`` are dropped; rows without an
        ``id`` are all kept.
    """
    if isinstance(records, pd.DataFrame):
        records_df = records.copy()
    else:
        records_df = pd.DataFrame(records)

    if "title" not in records_df.columns:
        if "display_name" in records_df.columns:
            records_df["title"] = records_df["display_name"]
        else:
            records_df["title"] = " "

    if "abstract" not in records_df.columns:
        if "abstract_inverted_index" in records_df.columns:
            records_df["abstract"] = [invert_abstract(value) for value in records_df["abstract_inverted_index"]]
        else:
            records_df["abstract"] = " "

    if "parsed_publication" not in records_df.columns:
        if "primary_location" in records_df.columns:
            records_df["parsed_publication"] = [get_pub(value) for value in records_df["primary_location"]]
        else:
            records_df["parsed_publication"] = " "

    for column in ("abstract", "parsed_publication", "title"):
        records_df[column] = records_df[column].fillna(" ")

    if "id" in records_df.columns:
        # duplicated() treats all missing ids as equal; keep those rows so
        # records without an id are not collapsed into one.
        keep = ~records_df.duplicated(subset=["id"]) | records_df["id"].isna()
        records_df = records_df[keep]

    return records_df.reset_index(drop=True)
|
| 86 |
|
| 87 |
+
|
| 88 |
+
def _clean_value(value):
    """Normalize a query value to plain words.

    Strips surrounding whitespace and quote characters, removes every
    character that is not a word character, whitespace or ``-``, and
    collapses whitespace runs to single spaces.
    """
    unquoted = value.strip().strip("\"'")
    words_only = re.sub(r"[^\w\s-]", "", unquoted)
    return " ".join(words_only.split())
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def _strip_quotes(value):
    """Remove surrounding whitespace, then surrounding single/double quotes."""
    trimmed = value.strip()
    return trimmed.strip("\"'")
|
| 97 |
+
|
| 98 |
+
|
| 99 |
def _sort_filename_part(sort_value):
    """Return the ``sort_<field>_<direction>`` part for one sort term, or None if empty."""
    term = sort_value.strip()
    direction = "asc"
    if term.startswith("-"):
        term, direction = term[1:], "desc"
    elif ":" in term:
        # "field:desc" form - keep the colon out of the filename.
        term, _, suffix = term.partition(":")
        if suffix.strip().lower() == "desc":
            direction = "desc"
    term = term.replace(".", "_")
    if not term:
        return None
    return f"sort_{term}_{direction}"


def openalex_url_to_filename(url):
    """Convert an OpenAlex URL to a filename-safe string with timestamp.

    The name is built from the search term, every filter token and every
    sort term of the normalized query, joined by ``__`` and followed by a
    ``YYYYMMDD_HHMMSS`` timestamp. Without any such part the stem is
    ``openalex_query``.

    Args:
        url (str): The OpenAlex search URL.

    Returns:
        str: The filename without extension. Names longer than 255
        characters are cut to 251 by shortening the descriptive part, so the
        timestamp survives truncation.
    """
    query = normalize_openalex_url(url)
    parts = []

    if query.params.get("search"):
        search_value = _clean_value(query.params["search"]).replace(" ", "_")
        if search_value:
            parts.append(f"search_{search_value}")

    for token in query.filter_tokens:
        clean_key = token.key.replace(".", "_")
        clean_value = _clean_value(token.value).replace(" ", "_")
        if clean_value:
            parts.append(f"{clean_key}_{clean_value}")

    if query.params.get("sort"):
        for sort_value in query.params["sort"].split(","):
            sort_part = _sort_filename_part(sort_value)
            if sort_part:
                parts.append(sort_part)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    stem = "__".join(parts) if parts else "openalex_query"
    suffix = f"__{timestamp}"
    filename = f"{stem}{suffix}"
    if len(filename) > 255:
        # 251 presumably leaves room for a 4-character extension; shorten the
        # stem rather than the timestamp so names stay unique.
        filename = f"{stem[:251 - len(suffix)]}{suffix}"
    return filename
|
| 128 |
|
| 129 |
+
|
| 130 |
def get_records_from_dois(doi_list, block_size=50):
    """Fetch the OpenAlex records for ``doi_list`` and return them as a DataFrame.

    The work is delegated to the shared client's ``fetch_records_from_dois``,
    which receives ``block_size`` unchanged.
    """
    records = get_openalex_client().fetch_records_from_dois(doi_list, block_size=block_size)
    return pd.DataFrame(records)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _lookup_display_name(entity, entity_id):
    """Return the ``display_name`` of one OpenAlex entity, or None.

    Args:
        entity: Entity collection name passed to ``client.get_entity``
            (for example ``"institutions"``).
        entity_id: Identifier of the entity to look up.

    Returns:
        The entity's ``display_name``, or ``None`` when the lookup raises,
        returns nothing, or the record has no such field.
    """
    client = get_openalex_client()
    try:
        record = client.get_entity(entity, entity_id, select_fields=("display_name",))
    except Exception:
        return None
    if not record:
        # An empty/None result must return None rather than raise on .get().
        return None
    return record.get("display_name")
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def _lookup_cited_work(entity_id):
    """Fetch ``authorships`` and ``publication_year`` of one work.

    Returns whatever ``client.get_entity`` returns, or ``None`` when the
    lookup raises.
    """
    client = get_openalex_client()
    wanted_fields = ("authorships", "publication_year")
    try:
        return client.get_entity("works", entity_id, select_fields=wanted_fields)
    except Exception:
        return None
|
| 152 |
+
|
| 153 |
|
| 154 |
def _describe_cited_work(work_id):
    """Return the ``Cites: ...`` label for the value of a ``cites`` filter."""
    cited_work = _lookup_cited_work(work_id)
    if not cited_work:
        return f"Cites: Work {work_id}"

    author_name = "Unknown"
    authorships = cited_work.get("authorships") or []
    if authorships:
        first_author = authorships[0].get("author") or {}
        # The last whitespace-separated token is taken as the surname; a
        # blank display name keeps "Unknown" instead of raising IndexError.
        name_parts = (first_author.get("display_name") or "").split()
        if name_parts:
            author_name = name_parts[-1]

    year = cited_work.get("publication_year") or "Unknown"
    return f"Cites: {author_name} ({year})"


def openalex_url_to_readable_name(url):
    """Convert an OpenAlex URL to a short, human-readable query description.

    Args:
        url (str): The OpenAlex search URL.

    Returns:
        str: Comma-separated description parts followed by the
        ``publication_year`` filter value, truncated to 60 characters with a
        trailing ``...``. ``"OpenAlex Query"`` when nothing can be described,
        ``"Works from <years>"`` when only a year filter is present.

    Filter tokens whose description raises are skipped (best effort).
    """
    query = normalize_openalex_url(url)

    parts = []
    year_range = None

    if query.params.get("search"):
        parts.append(f"Search: '{_strip_quotes(query.params['search'])}'")

    # Filter key -> (entity collection, label, fallback prefix); used only
    # when the filter holds a single id (no "|" alternatives).
    named_entity_filters = {
        "authorships.institutions.lineage": ("institutions", "From", "Institution "),
        "authorships.author.id": ("authors", "By", "Author "),
        "primary_location.source.id": ("sources", "In", "Source "),
        "topics.id": ("topics", "Topic", ""),
        "concepts.id": ("concepts", "Concept", ""),
    }
    type_mapping = {
        "article": "Articles",
        "book": "Books",
        "book-chapter": "Book Chapters",
        "dissertation": "Dissertations",
        "preprint": "Preprints",
    }

    for token in query.filter_tokens:
        key = token.key
        value = token.value

        try:
            if key == "title_and_abstract.search":
                parts.append(f"T&A: '{_strip_quotes(value)}'")

            elif key == "publication_year":
                # Appended after all other parts, see below.
                year_range = value

            elif key == "cites":
                parts.append(_describe_cited_work(value))

            elif key in named_entity_filters and "|" not in value:
                entity, label, fallback_prefix = named_entity_filters[key]
                resolved_name = _lookup_display_name(entity, value)
                shown_name = resolved_name or f"{fallback_prefix}{value}"
                parts.append(f"{label}: {shown_name}")

            elif key == "type":
                parts.append(f"Type: {type_mapping.get(value, value.replace('-', ' ').title())}")

            else:
                clean_key = key.replace("_", " ").replace(".", " ").title()
                clean_value = value.replace("_", " ")
                parts.append(f"{clean_key}: {clean_value}")

        except Exception:
            # Best effort: a token that cannot be described is left out.
            continue

    description = "OpenAlex Query" if not parts else ", ".join(parts)
    if year_range:
        description = f"{description}, {year_range}" if parts else f"Works from {year_range}"
    if len(description) > 60:
        description = description[:57] + "..."
    return description
|
|
|
requirements.txt
CHANGED
|
@@ -4,7 +4,6 @@ uvicorn
|
|
| 4 |
fastapi
|
| 5 |
numpy
|
| 6 |
requests
|
| 7 |
-
pyalex
|
| 8 |
compress-pickle
|
| 9 |
transformers
|
| 10 |
adapters
|
|
|
|
| 4 |
fastapi
|
| 5 |
numpy
|
| 6 |
requests
|
|
|
|
| 7 |
compress-pickle
|
| 8 |
transformers
|
| 9 |
adapters
|