##// END OF EJS Templates
v2.9.2 :: Update 'User-Agent' in URL parameters - download
eynilupu -
r23:7969fa062c2f master
parent child
Show More
@@ -1,514 +1,515
1 1 from ckanapi import RemoteCKAN
2 2 from datetime import datetime
3 3 from jrodb import download
4 4 from jrodb import resource
5 5 #from ckanapi.errors import NotAuthorized, NotFound, ValidationError, SearchQueryError, SearchError, CKANAPIError, ServerIncompatibleError
6 6 import sys
7 7 import platform
8 8 import os
9 9 import requests
10 10
11 11 class Api():
12 12 """
13 13 FINALIDAD:
14 14 Script para administrar y obtener la data del repositorio por medio de APIs.
15 15
16 16 REQUISITIOS PREVIOS:
17 17 - Paso 1: Tener "pip [Python 2]" o "pip3 [Python 3]" instalado:
18 18 - Paso 2: Instalar los siguientes paquetes:
19 19 En Python 2
20 20 - pip install -e git+http://intranet.igp.gob.pe:8082/DATABASES/ckanext-jro/api-cliente#egg=jrodb
21 21 En Python 3
22 22 - pip3 install -e git+http://intranet.igp.gob.pe:8082/DATABASES/ckanext-jro/api-cliente#egg=jrodb
23 23
24 24 FUNCIONES DISPONIBLES:
25 25 - action
26 26 - show
27 27 - search
28 28 - create
29 29 - patch
30 30 - delete
31 31 - download
32 32
33 33 EJEMPLOS:
34 34 #1:
35 35 with Api('http://demo.example.com', Authorization='#########') as <access_name>:
36 36 ... some operation(s) ...
37 37 #2:
38 38 <access_name> = Api('http://example.com', Authorization='#########')
39 39 ... some operation(s) ...
40 40 <access_name>.ckan.close()
41 41
42 42 REPORTAR ALGUN PROBLEMA:
43 43 Debe enviar un correo a eynilupu@igp.gob.pe detallando los siguientes pasos:
44 44 1) Correo para contactarlo
45 45 2) Descripcion del problema
46 46 3) ¿En que paso o seccion encontro el problema?
47 47 4) ¿Cual era el resultado que usted esperaba?
48 48 """
49 49 def __init__(self, url, Authorization=None, secure=True):
50 50 #-------- Check Secure -------#
51 51 self.verify = secure
52 52 if not secure and isinstance(secure, bool):
53 53 session = requests.Session()
54 54 session.verify = False
55 55 else:
56 56 session = None
57 57 #------------------------------#
58 58 self.url = url
59 ua = 'CKAN_JRO/2.9.2 (+'+str(self.url)+')'
59 #ua = 'CKAN_JRO/2.9.2 (+'+str(self.url)+')'
60 self.ua = 'CKAN_JRO/2.9.2 (+http://intranet.igp.gob.pe:8082/DATABASES/ckanext-jro/api-cliente)'
60 61 #ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
61 self.ckan = RemoteCKAN(self.url, apikey=Authorization, user_agent=ua, session=session)
62 self.ckan = RemoteCKAN(self.url, apikey=Authorization, user_agent=self.ua, session=session)
62 63 #self.ckan = RemoteCKAN(self.url, apikey=Authorization)
63 64 self.Authorization = Authorization
64 65 # Change for --> self.separator = os.sep
65 66 if platform.system() == 'Windows':
66 67 self.separator = '\\'
67 68 else:
68 69 self.separator = '/'
69 70
70 71 self.chunk_size = 1024
71 72 self.list = []
72 73 self.dict = {}
73 74 self.str = ''
74 75 self.check = 1
75 76 self.cont = 0
76 77
77 78 def __enter__(self):
78 79 return self
79 80
80 81 def __exit__(self, *args):
81 82 self.ckan.close()
82 83
83 84 def action(self, action, **kwargs):
84 85 """
85 86 FINALIDAD:
86 87 Funcion para llamar a las APIs disponibles
87 88
88 89 APIs DISPONIBLES:
89 90 CONSULTAR: "GUIA DE SCRIPT.pdf"
90 91
91 92 EJEMPLO:
92 93 <access_name>.action(<consuming API>, param_1 = <class 'param_1'>, ...)
93 94 """
94 95 #--------------- CASE: PACKAGE SEARCH ---------------#
95 96 if kwargs is not None:
96 97 if action == 'package_search':
97 98 self.list = ['facet_mincount', 'facet_limit', 'facet_field']
98 99 for facet in self.list:
99 100 if facet in kwargs:
100 101 kwargs[facet.replace('_', '.')] = kwargs[facet]
101 102 kwargs.pop(facet)
102 103 #----------------------------------------------------#
103 104 try:
104 105 return getattr(self.ckan.action, action)(**kwargs)
105 106 except:
106 107 _, exc_value, _ = sys.exc_info()
107 108 return exc_value
108 109
109 110 def show(self, type_option, id, **kwargs):
110 111 '''
111 112 FINALIDAD:
112 113 Funcion personalizada para una busqueda en especifico.
113 114
114 115 PARAMETROS DISPONIBLES:
115 116 CONSULTAR: "GUIA DE SCRIPT.pdf"
116 117
117 118 ESTRUCTURA:
118 119 <access_name>.show(type_option = <class 'str'>, id = <class 'str'>, param_1 = <class 'param_1'>, ...)
119 120 '''
120 121 if type(type_option) is str:
121 122 try:
122 123 if type_option == 'dataset':
123 124 return getattr(self.ckan.action, 'package_show')(id=id, **kwargs)
124 125 elif type_option == 'resource':
125 126 return getattr(self.ckan.action, 'resource_show')(id=id, **kwargs)
126 127 elif type_option == 'project':
127 128 return getattr(self.ckan.action, 'organization_show')(id=id, **kwargs)
128 129 elif type_option == 'collaborator':
129 130 return getattr(self.ckan.action, 'package_collaborator_list_for_user')(id=id, **kwargs)
130 131 elif type_option == 'member':
131 132 return getattr(self.ckan.action, 'organization_list_for_user')(id=id, **kwargs)
132 133 elif type_option == 'vocabulary':
133 134 return getattr(self.ckan.action, 'vocabulary_show')(id=id, **kwargs)
134 135 elif type_option == 'tag':
135 136 if not 'vocabulary_id' in kwargs:
136 137 print('Missing "vocabulary_id" value: assume it is a free tag')
137 138 return getattr(self.ckan.action, 'tag_show')(id=id, **kwargs)
138 139 elif type_option == 'user':
139 140 return getattr(self.ckan.action, 'user_show')(id=id, **kwargs)
140 141 elif type_option == 'job':
141 142 return getattr(self.ckan.action, 'job_show')(id=id, **kwargs)
142 143 else:
143 144 return 'ERROR:: "type_option = %s" is not accepted' % (type_option)
144 145 except:
145 146 _, exc_value, _ = sys.exc_info()
146 147 return exc_value
147 148 else:
148 149 return 'ERROR:: "type_option" must be a str'
149 150
150 151 def search(self, type_option, query=None, **kwargs):
151 152 '''
152 153 FINALIDAD:
153 154 Funcion personalizada para busquedas que satisfagan algun criterio.
154 155
155 156 PARAMETROS DISPONIBLES:
156 157 CONSULTAR: "GUIA DE SCRIPT.pdf"
157 158
158 159 ESTRUCTURA:
159 160 <access_name>.search(type_option = <class 'str'>, query = <class 'dict'>, param_1 = <class 'param_1'>, ...)
160 161 '''
161 162 if type(type_option) is str:
162 163 try:
163 164 if type_option == 'dataset':
164 165 key_replace = ['fq', 'fq_list', 'include_private']
165 166 key_point = ['facet_mincount', 'facet_limit', 'facet_field']
166 167 for key1, value1 in kwargs.items():
167 168 if not key1 in key_replace:
168 169 if key1 in key_point:
169 170 self.dict[key1.replace('_', '.')] = value1
170 171 else:
171 172 self.dict[key1] = value1
172 173
173 174 if query is not None:
174 175 if type(query) is dict:
175 176 self.dict['fq_list'] = []
176 177 #NUM_RESOURCES_MIN / NUM_RESOURCES_MAX
177 178 #----------------------------------------------------#
178 179 if 'dataset_start_date' in query:
179 180 if type(query['dataset_start_date']) is str:
180 181 try:
181 182 datetime.strptime(query['dataset_start_date'], '%Y-%m-%d')
182 183 if len(query['dataset_start_date']) != 10:
183 184 return '"dataset_start_date", must be: <YYYY-MM-DD>'
184 185 self.dict['fq_list'].append('dataset_start_date:"'+query['dataset_start_date']+'"')
185 186 self.list.append('dataset_start_date')
186 187 except:
187 188 return '"dataset_start_date" incorrect: "%s"' % (query['dataset_start_date'])
188 189 else:
189 190 return '"dataset_start_date" must be <str>'
190 191 #----------------------------------------------------#
191 192 if 'dataset_end_date' in query:
192 193 if type(query['dataset_end_date']) is str:
193 194 try:
194 195 datetime.strptime(query['dataset_end_date'], '%Y-%m-%d')
195 196 if len(query['dataset_end_date']) != 10:
196 197 return '"dataset_end_date", must be: <YYYY-MM-DD>'
197 198
198 199 if 'dataset_start_date' in query:
199 200 if query['dataset_start_date'] > query['dataset_end_date']:
200 201 return '"dataset_end_date" must be greater than "dataset_start_date"'
201 202
202 203 self.dict['fq_list'].append('dataset_end_date:"'+query['dataset_end_date']+'"')
203 204 self.list.append('dataset_end_date')
204 205 except:
205 206 return '"dataset_end_date" incorrect: "%s"' % (query['dataset_end_date'])
206 207 else:
207 208 return '"dataset_end_date" must be <str>'
208 209 #----------------------------------------------------#
209 210 if 'tags' in query:
210 211 if isinstance(query['tags'], (int, float, str, list)):
211 212 if type(query['tags']) is list:
212 213 for u in query['tags']:
213 214 self.dict['fq_list'].append('tags:"'+str(u)+'"')
214 215 else:
215 216 self.dict['fq_list'].append('tags:"'+str(query['tags'])+'"')
216 217
217 218 self.list.append('tags')
218 219 else:
219 220 return '"tags" must be <list> or <float> or <int> or <str>'
220 221 #----------------------------------------------------#
221 222 for key, value in query.items():
222 223 if value is not None and not key in self.list:
223 224 self.dict['fq_list'].append(str(key)+':"'+str(value)+'"')
224 225 else:
225 226 return '"query" must be <dict>'
226 227
227 228 return getattr(self.ckan.action, 'package_search')(include_private=True, **self.dict)
228 229
229 230 elif type_option == 'resource':
230 231 for key1, value1 in kwargs.items():
231 232 if key1 != 'fields':
232 233 self.dict[key1] = value1
233 234
234 235 if query is not None:
235 236 if type(query) is dict:
236 237 #----------------------------------------------------#
237 238 if 'file_date_min' in query:
238 239 if type(query['file_date_min']) is str:
239 240 try:
240 241 datetime.strptime(query['file_date_min'], '%Y-%m-%d')
241 242 if len(query['file_date_min']) != 10:
242 243 return '"file_date_min", must be: <YYYY-MM-DD>'
243 244 except:
244 245 return '"file_date_min" incorrect: "%s"' % (query['file_date_min'])
245 246 else:
246 247 return '"file_date_min" must be <str>'
247 248 #----------------------------------------------------#
248 249 if 'file_date_max' in query:
249 250 if type(query['file_date_max']) is str:
250 251 try:
251 252 datetime.strptime(query['file_date_max'], '%Y-%m-%d')
252 253 if len(query['file_date_max']) != 10:
253 254 return '"file_date_max", must be: <YYYY-MM-DD>'
254 255
255 256 if 'file_date_min' in query:
256 257 if query['file_date_min'] > query['file_date_max']:
257 258 return '"file_date_max" must be greater than "file_date_min"'
258 259 except:
259 260 return '"file_date_max" incorrect: "%s"' % (query['file_date_max'])
260 261 else:
261 262 return '"file_date_max" must be <str>'
262 263 #----------------------------------------------------#
263 264 self.dict['query'] = query
264 265 else:
265 266 return '"query" must be <dict>'
266 267 return getattr(self.ckan.action, 'resources_search')(**self.dict)
267 268
268 269 elif type_option == 'tag':
269 270 for key1, value1 in kwargs.items():
270 271 if key1 != 'fields':
271 272 self.dict[key1] = value1
272 273
273 274 if not 'vocabulary_id' in kwargs:
274 275 print('Missing "vocabulary_id" value: tags that don’t belong to any vocabulary')
275 276 else:
276 277 print('Only tags that belong to "{}" vocabulary'.format(kwargs['vocabulary_id']))
277 278
278 279 if query is not None:
279 280 if type(query) is dict:
280 281 if 'search' in query:
281 282 if type(query['search']) is list or type(query['search']) is str:
282 283 self.dict['query'] = query['search']
283 284 else:
284 285 return '"search" must be <list> or <str>'
285 286 else:
286 287 return '"query" must be <dict>'
287 288 return getattr(self.ckan.action, 'tag_search')(**self.dict)
288 289
289 290 else:
290 291 return 'ERROR:: "type_option = %s" is not accepted' % (type_option)
291 292
292 293 except:
293 294 _, exc_value, _ = sys.exc_info()
294 295 return exc_value
295 296 else:
296 297 return 'ERROR:: "type_option" must be <str>'
297 298
298 299 def create(self, type_option, select=None, **kwargs):
299 300 '''
300 301 FINALIDAD:
301 302 Funcion personalizada para crear.
302 303
303 304 PARAMETROS DISPONIBLES:
304 305 CONSULTAR: "GUIA DE SCRIPT.pdf"
305 306
306 307 ESTRUCTURA:
307 308 <access_name>.create(type_option = <class 'str'>, param_1 = <class 'param_1'>, ...)
308 309 '''
309 310 if type(type_option) is str:
310 311 try:
311 312 if type_option == 'dataset':
312 313 return getattr(self.ckan.action, 'package_create')(**kwargs)
313 314 if type_option == 'resource':
314 315 return resource.resource_create(self, **kwargs)
315 316 elif type_option == 'project':
316 317 return getattr(self.ckan.action, 'organization_create')(**kwargs)
317 318 elif type_option == 'member':
318 319 return getattr(self.ckan.action, 'organization_member_create')(**kwargs)
319 320 elif type_option == 'collaborator':
320 321 return getattr(self.ckan.action, 'package_collaborator_create')(**kwargs)
321 322 elif type_option == 'vocabulary':
322 323 return getattr(self.ckan.action, 'vocabulary_create')(**kwargs)
323 324 elif type_option == 'tag':
324 325 return getattr(self.ckan.action, 'tag_create')(**kwargs)
325 326 elif type_option == 'user':
326 327 return getattr(self.ckan.action, 'user_create')(**kwargs)
327 328 elif type_option == 'views':
328 329 if 'resource' == select:
329 330 self.list = ['package']
330 331 for key1, value1 in kwargs.items():
331 332 if not key1 in self.list:
332 333 self.dict[key1] = value1
333 334 return getattr(self.ckan.action, 'resource_create_default_resource_views')(**self.dict)
334 335 elif 'dataset' == select:
335 336 return getattr(self.ckan.action, 'package_create_default_resource_views')(**kwargs)
336 337 else:
337 338 return 'ERROR:: "select = %s" is not accepted' % (select)
338 339 else:
339 340 return 'ERROR:: "type_option = %s" is not accepted' % (type_option)
340 341 except:
341 342 _, exc_value, _ = sys.exc_info()
342 343 return exc_value
343 344 else:
344 345 return 'ERROR:: "type_option" must be <str>'
345 346
346 347 def patch(self, type_option, **kwargs):
347 348 '''
348 349 FINALIDAD:
349 350 Funciones personalizadas para actualizar
350 351
351 352 PARAMETROS DISPONIBLES:
352 353 CONSULTAR: "GUIA DE SCRIPT.pdf"
353 354
354 355 ESTRUCTURA:
355 356 <access_name>.patch(type_option = <class 'str'>, param_1 = <class 'param_1'>, ...)
356 357 '''
357 358 if type(type_option) is str:
358 359 try:
359 360 if type_option == 'dataset':
360 361 #Agregar que solo se debe modificar parámetros del Dataset y que no incluya Resources
361 362 return getattr(self.ckan.action, 'package_patch')(**kwargs)
362 363 elif type_option == 'project':
363 364 return getattr(self.ckan.action, 'organization_patch')(**kwargs)
364 365 elif type_option == 'resource':
365 366 return resource.resource_patch(self, **kwargs)
366 367 elif type_option == 'member':
367 368 return getattr(self.ckan.action, 'organization_member_create')(**kwargs)
368 369 elif type_option == 'collaborator':
369 370 return getattr(self.ckan.action, 'package_collaborator_create')(**kwargs)
370 371 else:
371 372 return 'ERROR:: "type_option = %s" is not accepted' % (type_option)
372 373 except:
373 374 _, exc_value, _ = sys.exc_info()
374 375 return exc_value
375 376 else:
376 377 return 'ERROR:: "type_option" must be <str>'
377 378
378 379 def delete(self, type_option, select=None, **kwargs):
379 380 '''
380 381 FINALIDAD:
381 382 Función personalizada para eliminar y/o purgar.
382 383
383 384 PARAMETROS DISPONIBLES:
384 385 CONSULTAR: "GUIA DE SCRIPT.pdf"
385 386
386 387 ESTRUCTURA:
387 388 <access_name>.delete(type_option = <class 'str'>, param_1 = <class 'param_1'>, ...)
388 389 '''
389 390 if type(type_option) is str:
390 391 try:
391 392 if type_option == 'dataset':
392 393 if select is None:
393 394 return 'ERROR:: "select" must not be "None"'
394 395 else:
395 396 if 'delete' == select:
396 397 return getattr(self.ckan.action, 'package_delete')(**kwargs)
397 398 elif 'purge' == select:
398 399 return getattr(self.ckan.action, 'dataset_purge')(**kwargs)
399 400 else:
400 401 return 'ERROR:: "select = %s" is not accepted' % (select)
401 402 elif type_option == 'project':
402 403 if select is None:
403 404 return 'ERROR:: "select" must not be "None"'
404 405 else:
405 406 if 'delete' == select:
406 407 return getattr(self.ckan.action, 'organization_delete')(**kwargs)
407 408 elif 'purge' == select:
408 409 return getattr(self.ckan.action, 'organization_purge')(**kwargs)
409 410 else:
410 411 return 'ERROR:: "select = %s" is not accepted' % (select)
411 412 elif type_option == 'resource':
412 413 if select is None:
413 414 return 'ERROR:: "select" must not be "None"'
414 415 else:
415 416 return resource.resource_delete(self, select, **kwargs)
416 417 elif type_option == 'vocabulary':
417 418 return getattr(self.ckan.action, 'vocabulary_delete')(**kwargs)
418 419 elif type_option == 'tag':
419 420 return getattr(self.ckan.action, 'tag_delete')(**kwargs)
420 421 elif type_option == 'user':
421 422 return getattr(self.ckan.action, 'user_delete')(**kwargs)
422 423 else:
423 424 return 'ERROR:: "type_option = %s" is not accepted' % (type_option)
424 425 except:
425 426 _, exc_value, _ = sys.exc_info()
426 427 return exc_value
427 428 else:
428 429 return 'ERROR:: "type_option" must be <str>'
429 430
430 431 def download(self, id, processes=1, path=os.path.expanduser("~"), **kwargs):
431 432 '''
432 433 FINALIDAD:
433 434 Funcion personalizada avanzada para la descarga de archivos existentes de un(os) dataset(s).
434 435
435 436 PARAMETROS DISPONIBLES:
436 437 CONSULTAR: "GUIA DE SCRIPT.pdf"
437 438
438 439 ESTRUCTURA:
439 440 <access_name>.download(id = <class 'str' or 'list'>, param_1 = <class 'param_1'>, ...)
440 441 '''
441 442 #------------------ PATH ----------------------#
442 443 if isinstance(path, str):
443 444 if os.path.isdir(path):
444 445 if not path.endswith(os.sep):
445 446 path = path + os.sep
446 447 test_txt = path + datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")+'.txt'
447 448 try:
448 449 file_txt = open(test_txt, 'w')
449 450 file_txt.close()
450 451 os.remove(test_txt)
451 452 except:
452 453 return 'ERROR:: Access denied, you are not authorized to write files: "%s"' % (path)
453 454 else:
454 455 return 'ERROR:: "path" does not exist'
455 456 else:
456 457 return 'ERROR:: "path" must be: <class "str">'
457 458
458 459 #------------------ PROCESSES -----------------#
459 460 if not isinstance(processes, int):
460 461 return 'ERROR:: "processes" must be: <class "int">'
461 462
462 463 #------------------ ID OR NAME ----------------#
463 464 if isinstance(id, str):
464 465 id = [id]
465 466 elif isinstance(id, list):
466 467 id = list(map(str, id))
467 468 else:
468 469 return 'ERROR:: dataset "id" must be: <class "str" or "list">'
469 470 #----------------------------------------------#
470 471 arguments = {
471 472 '--apikey': self.Authorization,
472 473 '--ckan-user': None,
473 474 '--config': None,
474 475 '--datapackages': path,
475 476 '--datastore-fields': False,
476 477 '--get-request': False,
477 478 '--insecure': not self.verify,
478 479 '--processes': str(processes),
479 480 '--quiet': False,
480 481 '--remote': self.url,
481 482 '--worker': False,
482 483 #'--log': 'log.txt',
483 484 #'--all': False,
484 485 #'--gzip': False,
485 486 #'--output': None,
486 487 #'--max-records': None,
487 488 #'--output-json': False,
488 489 #'--output-jsonl': False,
489 490 #'--create-only': False,
490 491 #'--help': False,
491 492 #'--input': None,
492 493 #'--input-json': False,
493 494 #'--start-record': '1',
494 495 #'--update-only': False,
495 496 #'--upload-logo': False,
496 497 #'--upload-resources': False,
497 498 #'--version': False,
498 499 'ID_OR_NAME': id,
499 500 'datasets': True,
500 501 'dump': True,
501 502 #'ACTION_NAME': None,
502 503 #'KEY:JSON': [],
503 504 #'KEY=STRING': [],
504 505 #'KEY@FILE': [],
505 506 #'action': False,
506 507 #'delete': False,
507 508 #'groups': False,
508 509 #'load': False,
509 510 #'organizations': False,
510 511 #'related': False,
511 512 #'search': False,
512 513 #'users': False
513 514 }
514 return download.dump_things_change(self.ckan, 'datasets', arguments, **kwargs) No newline at end of file
515 return download.dump_things_change(self.ckan, 'datasets', arguments, self.ua, **kwargs) No newline at end of file
@@ -1,236 +1,236
1 1 #from ckanapi.datapackage import populate_schema_from_datastore
2 2 from ckanapi.cli import workers, dump
3 3 from ckanapi.cli.utils import pretty_json, completion_stats, compact_json, quiet_int_pipe
4 4 from datetime import datetime
5 5 from tqdm import tqdm
6 6 import sys
7 7 import json
8 8 import os
9 9 import requests
10 10 import six
11 11
12 12 if sys.version_info.major == 3:
13 13 from urllib.parse import urlparse
14 14 else:
15 15 import urlparse
16 16
17 17 DL_CHUNK_SIZE = 100 * 1024
18 18
19 def dump_things_change(ckan, thing, arguments, worker_pool=None, stdout=None, stderr=None, **kwargs):
19 def dump_things_change(ckan, thing, arguments, ua, worker_pool=None, stdout=None, stderr=None, **kwargs):
20 20 if worker_pool is None:
21 21 worker_pool = workers.worker_pool
22 22 if stdout is None:
23 23 stdout = getattr(sys.__stdout__, 'buffer', sys.__stdout__)
24 24 if stderr is None:
25 25 stderr = getattr(sys.stderr, 'buffer', sys.stderr)
26 26
27 27 if arguments['--worker']:
28 28 return dump.dump_things_worker(ckan, thing, arguments)
29 29 '''
30 30 log = None
31 31 if arguments['--log']:
32 32 log = open(arguments['--log'], 'a')
33 33 '''
34 34 jsonl_output = stdout
35 35 if arguments['--datapackages']:
36 36 jsonl_output = open(os.devnull, 'wb')
37 37
38 38 names = arguments['ID_OR_NAME']
39 39
40 40 if names and isinstance(names[0], dict):
41 41 names = [rec.get('name',rec.get('id')) for rec in names]
42 42 '''
43 43 if arguments['--datapackages']:
44 44 arguments['--datastore-fields'] = True
45 45 '''
46 46 #----------------------------#
47 47 filtered_urls = {}
48 48 for val in names:
49 49 try:
50 50 filtered_urls[val] = getattr(ckan.action, 'url_resources')(id=val, **kwargs)
51 51 except:
52 52 _, exc_value, _ = sys.exc_info()
53 53 return exc_value
54 54 #----------------------------#
55 55
56 56 cmd = dump._worker_command_line(thing, arguments)
57 57 processes = int(arguments['--processes'])
58 58 if hasattr(ckan, 'parallel_limit'):
59 59 processes = min(processes, ckan.parallel_limit)
60 60 stats = completion_stats(processes)
61 61 pool = worker_pool(cmd, processes, enumerate(compact_json(n) + b'\n' for n in names))
62 62
63 63 results = {}
64 64 expecting_number = 0
65 65 with quiet_int_pipe() as errors:
66 66 for job_ids, finished, result in pool:
67 67 if not result:
68 68 return 1
69 69 timestamp, error, record = json.loads(result.decode('utf-8'))
70 70 results[finished] = record
71 71
72 72 #----------------------------------------#
73 73 datapackages_path = arguments['--datapackages']
74 74 datapackage_dir = name_no_repetition(record.get('name', ''), datapackages_path)
75 75 #----------------------------------------#
76 76 if not arguments['--quiet']:
77 77 stderr.write('** Finished: {0} | Job IDs: {1} | Next Report: {2} | Error: {3} | Path: {4} | Dataset Name: {5}\n'.format(
78 78 finished,
79 79 job_ids,
80 80 next(stats),
81 81 error,
82 82 datapackage_dir,
83 83 record.get('name', '') if record else '',
84 84 ).encode('utf-8'))
85 85 '''
86 86 if log:
87 87 log.write(compact_json([
88 88 timestamp,
89 89 finished,
90 90 error,
91 91 record.get('name', '') if record else None,
92 92 ]) + b'\n')
93 93 '''
94 94 if datapackages_path:
95 95 try:
96 96 filter_url = filtered_urls[record.get('name', '')]
97 97 except:
98 98 filter_url = filtered_urls[record.get('id', '')]
99 create_datapackage_change(record, filter_url, datapackage_dir, stderr, arguments['--apikey'], arguments['--remote'], arguments['--insecure'])
99 create_datapackage_change(record, filter_url, datapackage_dir, stderr, arguments['--apikey'], arguments['--remote'], arguments['--insecure'], ua)
100 100
101 101 while expecting_number in results:
102 102 record = results.pop(expecting_number)
103 103 if record:
104 104 jsonl_output.write(compact_json(record, sort_keys=True) + b'\n')
105 105 expecting_number += 1
106 106 if 'pipe' in errors:
107 107 return 1
108 108 if 'interrupt' in errors:
109 109 return 2
110 110
111 def create_datapackage_change(record, filtered_url, datapackage_dir, stderr, apikey, host_url, insecure):
111 def create_datapackage_change(record, filtered_url, datapackage_dir, stderr, apikey, host_url, insecure, ua):
112 112 resource_formats_to_ignore = ['API', 'api']
113 113
114 114 os.makedirs(os.path.join(datapackage_dir, 'data'))
115 115 record['path'] = datapackage_dir
116 116
117 117 ckan_resources = []
118 118 for resource in tqdm(record.get('resources', []), unit_scale=True):
119 119 #for resource in record.get('resources', []):
120 120 if resource['format'] in resource_formats_to_ignore:
121 121 continue
122 122
123 123 if not {'name': resource['name'], 'url': resource['url']} in filtered_url:
124 124 continue
125 125
126 126 if len(resource['url']) == 0:
127 127 continue
128 128
129 129 filename = name_no_repetition(resource['name'], os.path.join(datapackage_dir, 'data'), 'resource')
130 130 resource['path'] = os.path.join(datapackage_dir, 'data', filename)
131 131
132 cres = create_resource_change(resource, stderr, apikey, host_url, insecure)
132 cres = create_resource_change(resource, stderr, apikey, host_url, insecure, ua)
133 133 if not cres:
134 134 continue
135 135 '''
136 136 #----------------------------------------#
137 137 dres = {'path': os.path.join('data', filename),
138 138 'description': cres.get('description', ''),
139 139 'format': cres.get('format', ''),
140 140 'name': cres.get('name', ''),
141 141 'title': cres.get('name', '').title()}
142 142 #----------------------------------------#
143 143 populate_schema_from_datastore(cres, dres)
144 144 '''
145 145 ckan_resources.append(resource)
146 146
147 147 dataset = dict(record, resources=ckan_resources)
148 148 datapackage = dataset_to_datapackage_change(dataset)
149 149
150 150 json_path = os.path.join(datapackage_dir, 'datapackage.json')
151 151 with open(json_path, 'wb') as out:
152 152 out.write(pretty_json(datapackage))
153 153
154 154 return datapackage_dir, datapackage, json_path
155 155
156 def create_resource_change(resource, stderr, apikey, host_url, insecure):
156 def create_resource_change(resource, stderr, apikey, host_url, insecure, ua):
157 157 # ---------- REPLACE URL --------- #
158 158 if urlparse(host_url).netloc != 'www.igp.gob.pe' and urlparse(resource['url']).netloc == 'www.igp.gob.pe':
159 159 resource['url'] = resource['url'].replace(urlparse(resource['url']).scheme + '://' + urlparse(resource['url']).netloc,
160 160 urlparse(host_url).scheme + '://' + urlparse(host_url).netloc)
161 161 #----------------------------------#
162 162 try:
163 r = requests.get(resource['url'], headers={'Authorization': apikey}, stream=True, verify=not insecure)
163 r = requests.get(resource['url'], headers={'Authorization': apikey, 'User-Agent': ua}, stream=True, verify=not insecure)
164 164 #---------------------------------------#
165 165 try:
166 166 r.raise_for_status()
167 167 except requests.exceptions.HTTPError as e:
168 168 return False
169 169 #---------------------------------------#
170 170 with open(resource['path'], 'wb') as f:
171 171 for chunk in r.iter_content(chunk_size=DL_CHUNK_SIZE):
172 172 if chunk:
173 173 f.write(chunk)
174 174
175 175 except requests.ConnectionError:
176 176 stderr.write('URL {0} refused connection. The resource will not be downloaded\n'.format(resource['url']).encode('utf-8'))
177 177 except requests.exceptions.RequestException as e:
178 178 stderr.write('{0}\n'.format(str(e.args[0]) if len(e.args) > 0 else '').encode('utf-8'))
179 179 except Exception as e:
180 180 stderr.write('{0}'.format(str(e.args[0]) if len(e.args) > 0 else '').encode('utf-8'))
181 181 return resource
182 182
183 183 def dataset_to_datapackage_change(dataset_dict):
184 184 dp = {'name': dataset_dict['name'],
185 185 'id': dataset_dict['id'],
186 186 'path': dataset_dict['path'],
187 187 'last_update': datetime.strptime(dataset_dict['metadata_modified'], "%Y-%m-%dT%H:%M:%S.%f").strftime("%d-%b-%Y %I.%M %p")}
188 188
189 189 resources = dataset_dict.get('resources')
190 190 if resources:
191 191 dp['resources'] = [convert_to_datapackage_resource_change(r)
192 192 for r in resources]
193 193 return dp
194 194
195 195 def convert_to_datapackage_resource_change(resource_dict):
196 196 resource = {}
197 197
198 198 if resource_dict.get('id'):
199 199 resource['id'] = resource_dict['id']
200 200
201 201 if resource_dict.get('name'):
202 202 resource['name'] = resource_dict['name']
203 203
204 204 if resource_dict.get('path'):
205 205 if os.path.isfile(resource_dict['path']):
206 206 resource['path'] = resource_dict['path']
207 207 else:
208 208 resource['url'] = resource_dict['url']
209 209
210 210 schema = resource_dict.get('schema')
211 211 if isinstance(schema, six.string_types):
212 212 try:
213 213 resource['schema'] = json.loads(schema)
214 214 except ValueError:
215 215 resource['schema'] = schema
216 216 elif isinstance(schema, dict):
217 217 resource['schema'] = schema
218 218 return resource
219 219
220 220 def name_no_repetition(name, dir, option=''):
221 221 count = 0
222 222 while True:
223 223 count = count + 1
224 224 if not os.path.exists(os.path.join(dir, name)):
225 225 if option == 'resource':
226 226 return name
227 227 else:
228 228 return os.path.join(dir, name)
229 229
230 230 elif not os.path.exists(os.path.join(dir, '('+str(count)+')'+name)):
231 231 if option == 'resource':
232 232 return '('+str(count)+')'+name
233 233 else:
234 234 return os.path.join(dir, '('+str(count)+')'+name)
235 235 else:
236 236 pass No newline at end of file
@@ -1,17 +1,17
1 1 # encoding: utf-8
2 2 from setuptools import setup
3 3
4 4 setup(
5 5 name = "jrodb",
6 version = "2.9.2.0",
6 version = "2.9.2.1",
7 7 description = "Data Repository - JRO",
8 8 author = "Edson Ynilupu Mattos",
9 9 author_email = "eynilupu@igp.gob.pe",
10 10 url = "http://intranet.igp.gob.pe:8082/DATABASES/ckanext-jro/api-cliente",
11 11 packages = ["jrodb"],
12 12 install_requires = [
13 13 "ckanapi==4.7",
14 14 "requests",
15 15 "tqdm"
16 16 ],
17 17 ) No newline at end of file
General Comments 0
You need to be logged in to leave comments. Login now