1
2
3
4 """
5 Most of the XSD datatypes are handled directly by RDFLib. However, in some cases, that is not good enough. There are two
6 major reasons for this:
7
8 1. Some datatypes are missing from RDFLib and required by OWL 2 RL and/or RDFS
9 2. In other cases, though the datatype is present, RDFLib is fairly lax in checking the lexical value of those datatypes. Typical case is boolean.
10
11 Some of these deficiencies are handled by this module. All the functions convert the lexical value into a
12 python datatype (or return the original string if this is not possible) which will be used, eg,
13 for comparisons (equalities). If the lexical value constraints are not met, exceptions are raised.
14
15 @requires: U{RDFLib<https://github.com/RDFLib/rdflib>}, 4.0.0 and higher
16 @license: This software is available for use under the U{W3C Software License<http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231>}
17 @organization: U{World Wide Web Consortium<http://www.w3.org>}
18 @author: U{Ivan Herman<http://www.w3.org/People/Ivan/>}
19 """
20
21 __author__ = 'Ivan Herman'
22 __contact__ = 'Ivan Herman, ivan@w3.org'
23 __license__ = u'W3C® SOFTWARE NOTICE AND LICENSE, http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231'
24
25
26 from RDFClosure.RDFS import RDFNS as ns_rdf
27
28 from rdflib.term import XSDToPython, Literal, _toPythonMapping
29
30 from rdflib.namespace import XSD as ns_xsd
31
32 import datetime, time, re
33 from decimal import Decimal
34
35
class _namelessTZ(datetime.tzinfo):
    """(Nameless) timezone object. The python datetime object requires timezones as
    a specific object added to the conversion, rather than the explicit hour and minute
    difference used by XSD. This class is used to wrap around the hour/minute values.
    """
    def __init__(self, hours, minutes):
        """
        @param hours: hour offset
        @param minutes: minute offset
        """
        self.__offset = datetime.timedelta(hours=hours, minutes=minutes)
        self.__name = "nameless"

    def utcoffset(self, dt):
        """Return the fixed offset from UTC given at construction time."""
        return self.__offset

    def tzname(self, dt):
        """Return the (constant) placeholder name of this timezone."""
        return self.__name

    def dst(self, dt):
        """Return a zero daylight-saving adjustment: the offset is fixed."""
        return datetime.timedelta(0)
57
58
def _returnTimeZone(incoming_v):
    """Almost all time/date related methods require the extraction of an optional time zone information.

    The timezone is either a trailing 'Z' (UTC) or a trailing '+hh:mm' / '-hh:mm' offset.

    @param incoming_v: the time/date string
    @return (v,timezone) tuple; 'v' is the input string with the timezone info cut off, 'timezone' is a L{_namelessTZ} instance or None
    """
    # endswith() is also safe on the empty string, where indexing [-1] would fail
    if incoming_v.endswith('Z'):
        return incoming_v[:-1], _namelessTZ(0, 0)

    # The offset must be the very end of the string: the last six characters
    # are cut off below, so anything trailing the offset would be mangled.
    match = re.match(r".*([+-])([0-9]{2}):([0-9]{2})\Z", incoming_v)
    if match is None:
        return incoming_v, None

    sign, hours, minutes = match.groups()
    hours = int(hours)
    minutes = int(minutes)
    if sign == '-':
        # A negative offset negates both parts: '-05:30' is -(5h 30m) and
        # '-05:00' is exactly -5h.
        hours = -hours
        minutes = -minutes
    return incoming_v[:-6], _namelessTZ(hours, minutes)
81
82
83
84
def _strToBool(v):
    """The built-in conversion to boolean is way too lax. The xsd specification requires that only true, false, 1 or 0 should be used...

    The comparison is case sensitive: 'True' or 'FALSE' are not in the lexical space of xsd:boolean.

    @param v: the literal string defined as boolean
    @return corresponding boolean value
    @raise ValueError: invalid boolean values
    """
    if v in ("true", "1"):
        return True
    if v in ("false", "0"):
        return False
    raise ValueError("Invalid boolean literal value %s" % v)
97
98
99
100
def _strToDecimal(v):
    """The built in datatype handling for RDFLib maps a decimal number to float, but python
    also has a Decimal number. Better make use of that to use very high numbers.
    However, there is also a big difference between Python's decimal and XSD's decimal: the latter
    does not allow an exponential form, nor the special values (NaN, Infinity) that Python's
    Decimal constructor accepts. These are filtered out by checking the lexical form first.

    @param v: the literal string defined as decimal
    @return Decimal
    @raise ValueError: invalid decimal value
    """
    # Surrounding whitespace is tolerated, as Decimal() itself tolerates it
    lexical = v.strip()
    # Optional sign, then digits with an optional fraction, or a bare fraction
    if re.match(r"[+-]?(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)\Z", lexical) is None:
        raise ValueError("Invalid decimal literal value %s" % v)
    return Decimal(lexical)
116
117
118
# Letters usable as hexadecimal digits (both cases)
_hexc = ['A', 'B', 'C', 'D', 'E', 'F', 'a', 'b', 'c', 'd', 'e', 'f']

# Decimal digits
_numb = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
122
def _strToAnyURI(v):
    """Rudimentary test for the AnyURI value. If it is a relative URI, then some tests are done to filter out
    mistakes. This is not claimed to be a full implementation of the RFC.

    @param v: the literal string defined as a URI
    @return the incoming value
    @raise ValueError: invalid URI value
    """
    try:
        from urllib.parse import urlsplit  # Python 3
    except ImportError:
        from urlparse import urlsplit  # Python 2

    if len(v) == 0:
        return v
    if urlsplit(v)[0] != "":
        # There is a scheme: this is an absolute URI, accepted without further checks
        return v

    # Relative reference: only the first character is examined
    if v[0] == '%':
        # A leading percent sign must start a complete escape, ie, two hex digits
        if re.match(r"%[0-9A-Fa-f]{2}", v) is None:
            raise ValueError("Invalid IRI %s" % v)
        return v
    if v[0] == '?' or v[0] == ':':
        raise ValueError("Invalid IRI %s" % v)
    return v
151
152
153
def _strToBase64Binary(v):
    """Rudimentary test for the base64Binary value. The problem is that the built-in b64 module functions ignore the
    fact that only a certain family of characters are allowed to appear in the lexical value, so this is checked first.

    Only the ASCII letters, digits, '+', '/' and '=' are accepted. The empty string is a valid
    (zero length) value.

    @param v: the literal string defined as a base64encoded string
    @return the decoded (binary) content
    @raise ValueError: invalid base 64 binary value
    """
    import base64
    # An explicit ASCII character class: str.isalnum() would also let through
    # non-ASCII letters and digits.
    if re.match(r"[A-Za-z0-9+/=]*\Z", v) is None:
        raise ValueError("Invalid Base64Binary %s" % v)
    try:
        return base64.standard_b64decode(v)
    except (TypeError, ValueError):
        # Python 2 signals bad padding with TypeError, Python 3 with
        # binascii.Error (a ValueError subclass)
        raise ValueError("Invalid Base64Binary %s" % v)
169
170
171
# Value limits of the bounded XSD integer types, as [lower, upper] pairs to be
# passed as the 'interval' argument of _strToBoundNumeral. Both bounds are
# exclusive; None means that there is no bound on that side.

_limits_unsignedByte = [-1, 256]

_limits_byte = [-129, 128]

_limits_unsignedInt = [-1, 4294967296]

_limits_int = [-2147483649, 2147483648]

_limits_unsignedShort = [-1, 65536]

_limits_short = [-32769, 32768]

_limits_unsignedLong = [-1, 18446744073709551616]

_limits_long = [-9223372036854775809, 9223372036854775808]

_limits_positiveInteger = [0, None]

_limits_nonPositiveInteger = [None, 1]

_limits_nonNegativeInteger = [-1, None]

_limits_negativeInteger = [None, 0]
206
207
def _strToBoundNumeral(v, interval, conversion):
    """Test (and convert) a generic numerical type, with a check against a lower and upper limit.

    @param v: the literal string to be converted
    @param interval: lower and upper bounds (non inclusive). If the value is None, no comparison should be done
    @param conversion: conversion function, ie, int, long, etc
    @return the converted value
    @raise ValueError: invalid value
    """
    lower, upper = interval
    try:
        i = conversion(v)
        if (lower is None or lower < i) and (upper is None or i < upper):
            return i
    except (ArithmeticError, TypeError, ValueError):
        # A failed conversion or comparison is reported below as an invalid
        # value; unrelated exceptions (eg, KeyboardInterrupt) propagate.
        pass
    raise ValueError("Invalid numerical value %s" % v)
222
223
224
def _strToDouble(v):
    """Test and convert a double value into a Decimal. Raises an exception if the absolute value of the number is
    outside the permitted range, ie, between 1.0E-330 and 1.0E+310 (both exclusive). Zero is always accepted.
    Decimals are used because these fixed limits cannot be expressed with the python float.

    NOTE(review): the special values INF, -INF and NaN are rejected by the range check.

    @param v: the literal string defined as a double
    @return Decimal
    @raise ValueError: invalid value
    """
    try:
        value = Decimal(v)
        # Zero has to be tested separately: its absolute value is below the lower limit
        valid = value == 0 or Decimal("1.0E-330") < abs(value) < Decimal("1.0E+310")
    except (ArithmeticError, TypeError, ValueError):
        # Decimal signals an unparsable string (and an ordering comparison
        # involving NaN) with InvalidOperation, which is an ArithmeticError
        valid = False
    if not valid:
        raise ValueError("Invalid double %s" % v)
    return value
247
248
def _strToFloat(v):
    """Test and convert a float value into a Decimal. Raises an exception if the absolute value of the number is
    outside the permitted range, ie, between 1.0E-50 and 1.0E+40 (both exclusive). Zero is always accepted.
    (These fixed limits are the reasons why Decimal is used!)

    NOTE(review): the special values INF, -INF and NaN are rejected by the range check.

    @param v: the literal string defined as a float
    @return Decimal
    @raise ValueError: invalid value
    """
    try:
        value = Decimal(v)
        # Zero has to be tested separately: its absolute value is below the lower limit
        valid = value == 0 or Decimal("1.0E-50") < abs(value) < Decimal("1.0E+40")
    except (ArithmeticError, TypeError, ValueError):
        # Decimal signals an unparsable string (and an ordering comparison
        # involving NaN) with InvalidOperation, which is an ArithmeticError
        valid = False
    if not valid:
        raise ValueError("Invalid float %s" % v)
    return value
269
270
271
def _strToHexBinary(v):
    """Test (and convert) hexa integer values. The value must consist of pairs of hexadecimal
    digits only, ie, the number of characters should be even; signs, a '0x' prefix or
    whitespace are not accepted.

    @param v: the literal string defined as a hexa number
    @return integer value (a long on Python 2 if the value is large enough)
    @raise ValueError: invalid value
    """
    # The lexical form is checked explicitly: the integer constructor alone
    # would also accept a sign, a '0x' prefix and surrounding whitespace.
    if re.match(r"(?:[0-9A-Fa-f]{2})+\Z", v) is None:
        raise ValueError("Invalid hex binary number %s" % v)
    return int(v, 16)
283
284
285
286
def _strToDateTimeAndStamp(incoming_v, timezone_required):
    """Test (and convert) datetime and date timestamp values.

    Fractional seconds are converted into microseconds; digits beyond the sixth are dropped
    because the python datetime cannot represent them.

    @param incoming_v: the literal string defined as the date and time
    @param timezone_required: whether the timezone is required (ie, for date timestamp) or not
    @return datetime
    @rtype: datetime.datetime
    @raise ValueError: invalid datetime or date timestamp
    """
    if not incoming_v:
        raise ValueError("Invalid datetime %s" % incoming_v)

    # First, handle the timezone portion, if there is any
    (v, tzone) = _returnTimeZone(incoming_v)

    # Check on the timezone. For the time stamp version of the datatype the timezone must be present
    if timezone_required and tzone is None:
        raise ValueError("Invalid datetime %s" % incoming_v)

    # The fractional part of the seconds is not handled by strptime, so it is cut off here
    final_v = v
    microseconds = 0
    if '.' in v:
        # The fraction must be a non-empty run of digits closing the string
        match = re.match(r"(.*)\.([0-9]+)\Z", v)
        if match is None:
            raise ValueError("Invalid datetime %s" % incoming_v)
        final_v = match.group(1)
        # '.5' means half a second, ie, 500000 microseconds: pad (or cut) to six digits
        microseconds = int(match.group(2)[:6].ljust(6, '0'))

    try:
        tstr = time.strptime(final_v, "%Y-%m-%dT%H:%M:%S")
        # Passing tzinfo=None is the same as not passing a timezone at all
        return datetime.datetime(tstr.tm_year, tstr.tm_mon, tstr.tm_mday, tstr.tm_hour, tstr.tm_min, tstr.tm_sec,
                                 microseconds, tzone)
    except (TypeError, ValueError, OverflowError):
        raise ValueError("Invalid datetime %s" % incoming_v)
326
327
def _strToTime(incoming_v):
    """Test (and convert) time values.

    Fractional seconds are converted into microseconds; digits beyond the sixth are dropped
    because the python time object cannot represent them.

    @param incoming_v: the literal string defined as time value
    @return time
    @rtype: datetime.time
    @raise ValueError: invalid time value
    """
    if not incoming_v:
        raise ValueError("Invalid time %s" % incoming_v)

    # First, handle the timezone portion, if there is any
    (v, tzone) = _returnTimeZone(incoming_v)

    # The fractional part of the seconds is not handled by strptime, so it is cut off here
    final_v = v
    microseconds = 0
    if '.' in v:
        # The fraction must be a non-empty run of digits closing the string
        match = re.match(r"(.*)\.([0-9]+)\Z", v)
        if match is None:
            raise ValueError("Invalid time %s" % incoming_v)
        final_v = match.group(1)
        # '.5' means half a second, ie, 500000 microseconds: pad (or cut) to six digits
        microseconds = int(match.group(2)[:6].ljust(6, '0'))

    try:
        tstr = time.strptime(final_v, "%H:%M:%S")
        # Passing tzinfo=None is the same as not passing a timezone at all
        return datetime.time(tstr.tm_hour, tstr.tm_min, tstr.tm_sec, microseconds, tzone)
    except (TypeError, ValueError, OverflowError):
        raise ValueError("Invalid time %s" % incoming_v)
362
363
def _strToDate(incoming_v):
    """Test (and convert) date values.

    An optional timezone is accepted in the lexical value but it is not part of the result:
    the python date object cannot carry one.

    @param incoming_v: the literal string defined as date (in iso format)
    @return date
    @rtype: datetime.date
    @raise ValueError: invalid date value
    """
    if not incoming_v:
        raise ValueError("Invalid date %s" % incoming_v)

    # Cut off the timezone portion, if there is any; only the date part is used
    (final_v, _) = _returnTimeZone(incoming_v)

    try:
        tstr = time.strptime(final_v, "%Y-%m-%d")
        return datetime.date(tstr.tm_year, tstr.tm_mon, tstr.tm_mday)
    except (TypeError, ValueError, OverflowError):
        raise ValueError("Invalid date %s" % incoming_v)
381
382
383
384
385
386
def _strTogYearMonth(v):
    """Test gYearMonth value, ie, a string of the form YYYY-MM.

    The check is done by completing the value with a day and parsing the result as a date.

    @param v: the literal string
    @return v
    @raise ValueError: invalid value
    """
    try:
        time.strptime(v + "-01", "%Y-%m-%d")
    except (TypeError, ValueError):
        raise ValueError("Invalid gYearMonth %s" % v)
    return v
398
399
def _strTogYear(v):
    """Test gYear value, ie, a string of the form YYYY.

    The check is done by completing the value with a month and a day and parsing the result as a date.

    @param v: the literal string
    @return v
    @raise ValueError: invalid value
    """
    try:
        time.strptime(v + "-01-01", "%Y-%m-%d")
    except (TypeError, ValueError):
        raise ValueError("Invalid gYear %s" % v)
    return v
411
412
def _strTogMonthDay(v):
    """Test gMonthDay value. Both the XSD lexical form (--MM-DD) and the bare MM-DD form are accepted.

    The check is done by completing the value with a (leap) year and parsing the result as a date.

    @param v: the literal string
    @return v
    @raise ValueError: invalid value
    """
    try:
        # Drop the leading '--' of the XSD form, if present
        month_day = v[2:] if v.startswith("--") else v
        # 2008 is a leap year, so that the 29th of February is accepted
        time.strptime("2008-" + month_day, "%Y-%m-%d")
    except (AttributeError, TypeError, ValueError):
        raise ValueError("Invalid gMonthDay %s" % v)
    return v
424
425
def _strTogDay(v):
    """Test gDay value. Both the XSD lexical form (---DD) and the bare DD form are accepted.

    The check is done by completing the value with a year and a (31 day long) month and parsing
    the result as a date.

    @param v: the literal string
    @return v
    @raise ValueError: invalid value
    """
    try:
        # Drop the leading '---' of the XSD form, if present
        day = v[3:] if v.startswith("---") else v
        time.strptime("2001-01-" + day, "%Y-%m-%d")
    except (AttributeError, TypeError, ValueError):
        raise ValueError("Invalid gDay %s" % v)
    return v
437
438
def _strTogMonth(v):
    """Test gMonth value. Both the XSD lexical form (--MM) and the bare MM form are accepted.

    The check is done by completing the value with a year and a day and parsing the result as a date.

    @param v: the literal string
    @return v
    @raise ValueError: invalid value
    """
    try:
        # Drop the leading '--' of the XSD form, if present
        month = v[2:] if v.startswith("--") else v
        time.strptime("2001-" + month + "-01", "%Y-%m-%d")
    except (AttributeError, TypeError, ValueError):
        raise ValueError("Invalid gMonth %s" % v)
    return v
450
451
452
def _strToXMLLiteral(v):
    """Test (and convert) XML Literal values. The string is parsed as an XML document and
    serialized again by minidom; this normalizes the text but is not a full XML canonicalization.

    NOTE(review): minidom does not protect against maliciously constructed XML (eg, entity
    expansion); confirm that literals coming from untrusted sources are acceptable here.

    @param v: the literal string defined as an xml literal
    @return the re-serialized version of the same xml text
    @raise ValueError: incorrect xml string
    """
    import xml.dom.minidom
    try:
        dom = xml.dom.minidom.parseString(v)
        return dom.toxml()
    except Exception:
        # Whatever the parser's complaint is (expat error, encoding problem, ...),
        # it means that the lexical value is not well-formed XML.
        raise ValueError("Invalid XML Literal %s" % v)
465
466
467
# Regular expression for a language tag
_re_language = r"[a-zA-Z]{1,8}(-[a-zA-Z0-9]{1,8})*"

# Regular expression for NMTOKEN; also used for Name, together with _re_Name_ex
_re_NMTOKEN = r"[\w:_.\-]+"

# Characters not permitted at the starting position of a Name
_re_Name_ex = ['.', '-'] + _numb

# Regular expression for an NCName, ie, like NMTOKEN but without the colon
_re_NCName = r"[\w_.\-]+"

# Characters not permitted at the starting position of an NCName
_re_NCName_ex = ['.', '-'] + _numb
481
482
def _strToVal_Regexp(v, regexp, flag=0, excludeStart=()):
    """Test (and convert) a generic string type, with a check against a regular expression.

    The regular expression must match the whole string.

    NOTE(review): the default values of 'flag' and 'excludeStart' are inferred from the calls
    that leave them out; confirm them against the callers outside this module.

    @param v: the literal string to be converted
    @param regexp: the regular expression to check against
    @param flag: flags to be used in the regular expression
    @param excludeStart: array of characters disallowed in the first position
    @return original string
    @raise ValueError: invalid value
    """
    match = re.match(regexp, v, flag)
    # re.match only anchors the beginning; the match has to extend to the end, too
    if match is None or match.end() != len(v):
        raise ValueError("Invalid literal %s" % v)
    if len(excludeStart) > 0 and len(v) > 0 and v[0] in excludeStart:
        raise ValueError("Invalid literal %s" % v)
    return v
499
500
# Regular expression for a non-empty string without line feed, tab or carriage return
_re_token = "[^\n\t\r]+"
502
503
def _strToToken(v):
    """Test (and convert) a string to a token: a string without line feed, tab or carriage
    return, without leading or trailing spaces and without two or more spaces in a row.

    @param v: the literal string to be converted
    @return original string
    @raise ValueError: invalid value
    """
    if len(v) == 0:
        return v
    if '\n' in v or '\t' in v or '\r' in v:
        raise ValueError("Invalid literal %s" % v)
    # Only the space character (#x20) counts here: a generic whitespace split
    # would also treat, eg, non-breaking spaces as separators.
    if v[0] == ' ' or v[-1] == ' ' or '  ' in v:
        raise ValueError("Invalid literal %s" % v)
    return v
521
522
523
def _strToPlainLiteral(v):
    """Test (and convert) a plain literal. The lexical form is 'text@lang'; the text is
    everything before the last '@' character (it may span several lines) and the language
    tag, which may be empty, is everything after it.

    @param v: the literal to be converted
    @return a new RDFLib Literal, with a (lower case) language tag if one is present
    @raise ValueError: invalid value
    """
    lit, at_sign, lang = v.rpartition('@')
    if at_sign == "":
        # The '@' separator is required, even if the language tag is empty
        raise ValueError("Invalid plain literal %s" % v)
    if lang == "":
        # No language tag
        return Literal(lit)
    try:
        # The language tag has to be checked
        lang = _strToVal_Regexp(lang, _re_language)
        return Literal(lit, lang=lang.lower())
    except (TypeError, ValueError):
        raise ValueError("Invalid plain literal %s" % v)
548
549
550
# Maps datatype URIs to the functions converting (and checking) the lexical values.
# 'int' is used for all integer conversions: it yields arbitrarily large integers
# on Python 2 (switching to long by itself) and on Python 3 alike.
AltXSDToPYTHON = {
    ns_xsd["language"]: lambda v: _strToVal_Regexp(v, _re_language),
    ns_xsd["NMTOKEN"]: lambda v: _strToVal_Regexp(v, _re_NMTOKEN, re.U),
    ns_xsd["Name"]: lambda v: _strToVal_Regexp(v, _re_NMTOKEN, re.U, _re_Name_ex),
    ns_xsd["NCName"]: lambda v: _strToVal_Regexp(v, _re_NCName, re.U, _re_NCName_ex),
    ns_xsd["token"]: _strToToken,
    ns_rdf["PlainLiteral"]: _strToPlainLiteral,
    ns_xsd["boolean"]: _strToBool,
    ns_xsd["decimal"]: _strToDecimal,
    ns_xsd["anyURI"]: _strToAnyURI,
    ns_xsd["base64Binary"]: _strToBase64Binary,
    ns_xsd["double"]: _strToDouble,
    ns_xsd["float"]: _strToFloat,
    ns_xsd["byte"]: lambda v: _strToBoundNumeral(v, _limits_byte, int),
    ns_xsd["int"]: lambda v: _strToBoundNumeral(v, _limits_int, int),
    ns_xsd["long"]: lambda v: _strToBoundNumeral(v, _limits_long, int),
    ns_xsd["positiveInteger"]: lambda v: _strToBoundNumeral(v, _limits_positiveInteger, int),
    ns_xsd["nonPositiveInteger"]: lambda v: _strToBoundNumeral(v, _limits_nonPositiveInteger, int),
    ns_xsd["negativeInteger"]: lambda v: _strToBoundNumeral(v, _limits_negativeInteger, int),
    ns_xsd["nonNegativeInteger"]: lambda v: _strToBoundNumeral(v, _limits_nonNegativeInteger, int),
    ns_xsd["short"]: lambda v: _strToBoundNumeral(v, _limits_short, int),
    ns_xsd["unsignedByte"]: lambda v: _strToBoundNumeral(v, _limits_unsignedByte, int),
    ns_xsd["unsignedShort"]: lambda v: _strToBoundNumeral(v, _limits_unsignedShort, int),
    ns_xsd["unsignedInt"]: lambda v: _strToBoundNumeral(v, _limits_unsignedInt, int),
    ns_xsd["unsignedLong"]: lambda v: _strToBoundNumeral(v, _limits_unsignedLong, int),
    ns_xsd["hexBinary"]: _strToHexBinary,
    ns_xsd["dateTime"]: lambda v: _strToDateTimeAndStamp(v, False),
    ns_xsd["dateTimeStamp"]: lambda v: _strToDateTimeAndStamp(v, True),
    ns_rdf["XMLLiteral"]: _strToXMLLiteral,
    ns_xsd["integer"]: int,
    ns_xsd["string"]: lambda v: v,
    ns_rdf["HTML"]: lambda v: v,
    ns_xsd["normalizedString"]: lambda v: _strToVal_Regexp(v, _re_token),

    # Date and time related datatypes
    ns_xsd["time"]: _strToTime,
    ns_xsd["date"]: _strToDate,
    ns_xsd["gYearMonth"]: _strTogYearMonth,
    ns_xsd["gYear"]: _strTogYear,
    ns_xsd["gMonthDay"]: _strTogMonthDay,
    ns_xsd["gDay"]: _strTogDay,
    ns_xsd["gMonth"]: _strTogMonth,
}
594
595
597 """Registering the datatypes item for RDFLib, ie, bind the dictionary values. The 'bind' method of RDFLib adds
598 extra datatypes to the registered ones in RDFLib, though the table used here (ie, L{AltXSDToPYTHON}) actually overrides
599 all of the default conversion routines. The method also add a Decimal entry to the PythonToXSD array of RDFLib.
600 """
601 _toPythonMapping.update(AltXSDToPYTHON)
602
603
605 """Restore the original (ie, RDFLib) set of lexical conversion routines.
606 """
607 _toPythonMapping.update(XSDToPython)
608
609
610
if __name__ == '__main__':
    # Command line test: convert one lexical value with the conversion
    # function registered for the given XSD datatype.
    import sys
    if len(sys.argv) < 3:
        sys.stderr.write("Usage: %s <xsd datatype local name> <lexical value>\n" % sys.argv[0])
        sys.exit(1)
    dtype = sys.argv[1]
    string = sys.argv[2]
    datatype = ns_xsd[dtype]
    result = AltXSDToPYTHON[datatype](string)
    # The parenthesized form with one argument prints the same on Python 2 and 3
    print(type(result))
    print(result)
619