Ver Fonte

Proper encoding detection and decoding errors handling

Innocenty Enikeew há 10 anos atrás
pai
commit
3b3f5629ea
2 ficheiros alterados com 32 adições e 15 exclusões
  1. 2 1
      rss.lisp
  2. 30 14
      utils.lisp

+ 2 - 1
rss.lisp

@@ -9,7 +9,8 @@
 
 (defun find-rss-links (url)
   (handler-case
-      (multiple-value-bind (root uri) (xml-request url)
+      (multiple-value-bind (root uri encoding) (xml-request url)
+        (declare (ignore encoding))
         (loop for link in (get-by-tag root "link")
            when (string= "application/rss+xml" (plump:attribute link "type"))
            collect (list (plump:attribute link "title")

+ 30 - 14
utils.lisp

@@ -92,25 +92,41 @@ is replaced with replacement."
      nil)))
 
 ;; XML processing
-(defun xml-request (url)
+(defun xml-request (url &optional encoding)
   (multiple-value-bind (raw-body status headers uri http-stream)
       (drakma:http-request (http-default url)
                            :force-binary t
                            :decode-content t)
-    (declare (ignore status headers http-stream))
-    (let* ((dom
-            (handler-case
-                (plump:parse (flex:octets-to-string raw-body :external-format :utf-8))
-              (flex:external-format-encoding-error ()
-                (plump:parse (flex:octets-to-string raw-body)))))
-           (encoding (ignore-errors
-                       (plump:get-attribute (plump:first-child dom) "encoding"))))
+    (declare (ignore status http-stream))
+    (let ((encoding
+           (or
+            ;; 1. Provided encoding
+            encoding
+            ;; 2. Content-type header
+            (ignore-errors
+              (let ((ct (aget :content-type headers)))
+                (subseq ct (1+ (position #\= ct)))))
+            ;; 3. Parse first 1000 bytes
+            (ignore-errors
+              (let ((dom (plump:parse (flex:octets-to-string (subseq raw-body 0 1000)))))
+                (or
+                 ;; 3.1 Content-type from http-equiv
+                 (ignore-errors
+                   (let ((ct (loop for meta in (get-by-tag dom "meta")
+                                for http-equiv = (plump:get-attribute meta "http-equiv")
+                                for content = (plump:get-attribute meta "content")
+                                when (equal http-equiv "Content-Type")
+                                return content)))
+                     (subseq ct (1+ (position #\= ct)))))
+                 ;; 3.2 'content' xml node attribute
+                 (ignore-errors (plump:get-attribute (plump:first-child dom) "encoding")))))
+            ;; 4. Default 'utf-8'
+            "utf-8")))
       (values
-       (if (and encoding (not (equal encoding "utf-8")))
-           (plump:parse (flex:octets-to-string
-                         raw-body
-                         :external-format (intern encoding 'keyword)))
-           dom)
+       (handler-bind ((flex:external-format-encoding-error
+                       (lambda (c) (use-value #\? c))))
+         (plump:parse
+          (flex:octets-to-string raw-body :external-format (intern encoding 'keyword))))
        uri))))
 
 (defun get-by-tag (node tag)