diff --git a/packages/readabilityjs/Readability.js b/packages/readabilityjs/Readability.js
index abdbfa8d0e..2934841bd4 100644
--- a/packages/readabilityjs/Readability.js
+++ b/packages/readabilityjs/Readability.js
@@ -76,6 +76,36 @@ const extractPublishedDateFromAuthor = (author)=> {
   return [authorName, null];
 };
 
+/**
+ * Extracts a published date embedded in a URL, for example
+ * https://example.com/2023/09/26/story or https://example.com/2023-09-26/story.html
+ *
+ * @param {string} url The document URL to inspect.
+ * @returns {Date|null} Local-time midnight of the matched calendar day, or
+ * null when the URL is empty or holds no valid yyyy/mm/dd or yyyy-mm-dd date.
+ */
+const extractPublishedDateFromUrl = (url) => {
+  if (!url) return null;
+
+  // The \D guards keep the pattern from matching inside a longer digit run
+  // (such as an article id); \2 requires both separators to be the same.
+  const regex = /(?:^|\D)(\d{4})([\/-])(\d{2})\2(\d{2})(?!\d)/;
+  const match = regex.exec(url);
+  if (!match) return null;
+
+  const year = parseInt(match[1], 10);
+  const month = parseInt(match[3], 10) - 1; // January is 0 in JavaScript Date
+  const day = parseInt(match[4], 10);
+  const date = new Date(year, month, day);
+
+  // Date rolls out-of-range parts over (month 13, day 32, Feb 30) instead of
+  // failing, so only accept the result when it round-trips unchanged.
+  const isRealDay = date.getFullYear() === year &&
+    date.getMonth() === month &&
+    date.getDate() === day;
+  return isRealDay ? date : null;
+};
+
 /**
  * Public constructor.
  * @param {Document} doc The document to parse.
@@ -1081,6 +1111,18 @@ Readability.prototype = {
     }
     // we don't want to check for dates in the URL's
     if (node.tagName.toLowerCase() === 'a') return
+    // get the datetime from time element
+    if (node.tagName.toLowerCase() === 'time') {
+      const datetime = node.getAttribute('datetime')
+      if (datetime) {
+        const date = new Date(datetime)
+        if (!isNaN(date)) {
+          this._articlePublishedDate = date
+          return true
+        }
+      }
+    }
+
     // Searching for the real date in the text content
     const content = node.textContent.trim()
     let dateFound
@@ -3056,7 +3098,11 @@ Readability.prototype = {
       return null;
 
     const byline = metadata.byline || this._articleByline;
-    const [author, publishedAt] = extractPublishedDateFromAuthor(byline);
+    const [author, publishedDateFromAuthor] = extractPublishedDateFromAuthor(byline);
+    const publishedDate = metadata.publishedDate ||
+      extractPublishedDateFromUrl(this._documentURI) ||
+      publishedDateFromAuthor ||
+      this._articlePublishedDate;
 
     this._postProcessContent(articleContent);
 
@@ -3092,7 +3138,7 @@ Readability.prototype = {
       siteName: metadata.siteName,
       siteIcon: metadata.siteIcon,
       previewImage: metadata.previewImage,
-      publishedDate: metadata.publishedDate || publishedAt || this._articlePublishedDate,
+      publishedDate,
       language: this._getLanguage(metadata.locale || this._languageCode),
     };
   }
diff --git a/packages/readabilityjs/test/index.html b/packages/readabilityjs/test/index.html
index 22cb8dbef3..e997271df7 100644
--- a/packages/readabilityjs/test/index.html
+++ b/packages/readabilityjs/test/index.html
@@ -20,6 +20,12 @@ [dom-distiller] +
  • caixin
    + [source] + [readability] + [dom-distiller] +
  • +
  • news.utexas
    [source] [readability] diff --git a/packages/readabilityjs/test/test-pages/caixin/distiller.html b/packages/readabilityjs/test/test-pages/caixin/distiller.html new file mode 100644 index 0000000000..adae61ed12 --- /dev/null +++ b/packages/readabilityjs/test/test-pages/caixin/distiller.html @@ -0,0 +1,20 @@ +

    +   【财新网】9月26日,汽车服务平台途虎养车正式在港交所主板挂牌上市。途虎养车( 09690.HK )上市发行价为28港元/股,此前公司披露的发行价区间为28港元/股至31港元/股,即实际发行价为区间下限。当日,途虎养车收报29.5港元/股,较发行价涨5.36%,市值为239.6亿港元。 +

    +   途虎养车上市不易。途虎养车2022年1月即在港交所递表,2022年8月、2023年3月两次重新递交上市申请材料,终于在2023年8月23日通过聆讯。 +

    + + +
    + 后获取已订阅的阅读权限 +
    + 财新通会员
    + 可畅读全文 +
    +

    +   推荐进入财新数据库,可随时查阅公司股价走势、结构人员变化等投资信息。 +

    + 责任编辑:屈运栩 | 版面编辑:刘潇(ZN028) +
    + 话题: +
    \ No newline at end of file diff --git a/packages/readabilityjs/test/test-pages/caixin/expected-metadata.json b/packages/readabilityjs/test/test-pages/caixin/expected-metadata.json new file mode 100644 index 0000000000..b974abcc29 --- /dev/null +++ b/packages/readabilityjs/test/test-pages/caixin/expected-metadata.json @@ -0,0 +1,11 @@ +{ + "title": "途虎养车港交所挂牌 腾讯为最大外部股东", + "byline": "文|财新 余聪", + "dir": null, + "excerpt": "途虎养车 腾讯国内汽车服务市场高度分散,2022年,途虎养车取得汽车服务收入115亿元,市场份额0.9%", + "siteName": "fakehost", + "previewImage": "https://img.caixin.com/2023-09-26/169572084568190_560_373.jpg", + "publishedDate": "2023-09-25T16:00:00.000Z", + "language": "English", + "readerable": true +} diff --git a/packages/readabilityjs/test/test-pages/caixin/expected.html b/packages/readabilityjs/test/test-pages/caixin/expected.html new file mode 100644 index 0000000000..1d85d1b14e --- /dev/null +++ b/packages/readabilityjs/test/test-pages/caixin/expected.html @@ -0,0 +1,45 @@ +
    +
    +
    +

    途虎养车港交所挂牌 腾讯为最大外部股东 +

    + + +
    + +

    文|财新 余聪

    +

    2023年09月26日 17:22

    + + + + 试听 +
    +

    国内汽车服务市场高度分散,2022年,途虎养车取得汽车服务收入115亿元,市场份额0.9%

    +
    +
    +

      【财新网】9月26日,汽车服务平台途虎养车正式在港交所主板挂牌上市。途虎养车( 09690.HK )上市发行价为28港元/股,此前公司披露的发行价区间为28港元/股至31港元/股,即实际发行价为区间下限。当日,途虎养车收报29.5港元/股,较发行价涨5.36%,市值为239.6亿港元。

    +

      途虎养车上市不易。途虎养车2022年1月即在港交所递表,2022年8月、2023年3月两次重新递交上市申请材料,终于在2023年8月23日通过聆讯。

    +
    + + +
    +

    +

    登录 后获取已订阅的阅读权限

    + + + + + + +
    +
    +

    +
    + + +

      推荐进入财新数据库,可随时查阅公司股价走势、结构人员变化等投资信息。

    +

    责任编辑:屈运栩 | 版面编辑:刘潇(ZN028)

    +
    +
    diff --git a/packages/readabilityjs/test/test-pages/caixin/source.html b/packages/readabilityjs/test/test-pages/caixin/source.html new file mode 100644 index 0000000000..cb9392fb1e --- /dev/null +++ b/packages/readabilityjs/test/test-pages/caixin/source.html @@ -0,0 +1,2275 @@ + + + + + + + + + + + + 途虎养车港交所挂牌 腾讯为最大外部股东_财新网_财新网 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    +
    + + +
    +
    +
    + + +
    财新传媒 + + +
    +
    +
    +
    + 财新网 > 汽车 > 正文 +
    + +
    + +
    +
    + +
    +
    + + +
    + +
    + +
    +
    +
    + +
    + +
    +
    + +
    + + + + +
    +
    +

    + 途虎养车港交所挂牌 腾讯为最大外部股东 +

    + +
    + +
    + 文|财新 余聪 +
    + 2023年09月26日 17:22 + + + 试听 +
    +
    + 国内汽车服务市场高度分散,2022年,途虎养车取得汽车服务收入115亿元,市场份额0.9% +
    +
    +
    +
    +
    + +
    +
    + 上海,一处途虎养车门店。途虎养车2022年1月即在港交所递表,2022年8月、2023年3月两次重新递交上市申请材料,终于在2023年8月23日通过聆讯。图:Qilai Shen/视觉中国 +
    +
    +
    +
    + + +
    + + +
    +

    +   【财新网】9月26日,汽车服务平台途虎养车正式在港交所主板挂牌上市。途虎养车( 09690.HK )上市发行价为28港元/股,此前公司披露的发行价区间为28港元/股至31港元/股,即实际发行价为区间下限。当日,途虎养车收报29.5港元/股,较发行价涨5.36%,市值为239.6亿港元。 +

    +

    +   途虎养车上市不易。途虎养车2022年1月即在港交所递表,2022年8月、2023年3月两次重新递交上市申请材料,终于在2023年8月23日通过聆讯。 +

    +
    +
    +
    +
    + + +
    +
    + +
    +
    + + +
    + 登录 后获取已订阅的阅读权限 +
    +
    +
    + 财新通会员
    + 可畅读全文 +
    订阅/会员升级 +
    +
    +
    +
    +
    +
    + 请朋友免费读财新 +
    +
    +
    +
    + +
    +
    +
    + + + + +
    +
    +
    + + +
    + +
    +
    + +
    +
    + +
    +
    +
    +
    +
    +
    + +
    + +
    +

    +   推荐进入财新数据库,可随时查阅公司股价走势、结构人员变化等投资信息。 +

    +
    +
    +
    + 责任编辑:屈运栩 | 版面编辑:刘潇(ZN028) +
    + + +
    + +
    + +
    +
    + 话题: +
    +
    + #港交所+关注 +
    +
    + #腾讯+关注 +
    +
    + #京东+关注 +
    +
    +
    + +
    + +
    + +
    + +
    +
    + + + +
    + +
    + + + + +
    + + +
    + +
    + +
    +

    + 图片推荐 +

    +
    + + +
    + +
    +
    + +
    +
    + + + + +
    + +
    +
    + +
    + +
    + +
    + +
    +
    + +
    +
    + + +
    +
    +
    + 财新网主编精选版电邮 + 样例 +
    +
    + 财新网新闻版电邮全新升级!财新网主编精心编写,每个工作日定时投递,篇篇重磅,可信可引。 +
    +
    + 订阅 +
    +
    +
    + + + +
    + + + + + + +
    +

    + 视频 +

    +
    + +
    +
     + + + + +
    +
    +
    + +
    +
    + + + + + + +
    + + + + + + + + + + + + + + + + +
    +
    +
    +

    + +

    +
    + +
    +
    + +
    + + diff --git a/packages/readabilityjs/test/test-pages/caixin/url.txt b/packages/readabilityjs/test/test-pages/caixin/url.txt new file mode 100644 index 0000000000..dbae9da787 --- /dev/null +++ b/packages/readabilityjs/test/test-pages/caixin/url.txt @@ -0,0 +1 @@ +https://www.caixin.com/2023-09-26/102112537.html \ No newline at end of file diff --git a/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected-metadata.json b/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected-metadata.json index 61fe09886c..d2bc065e6d 100644 --- a/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected-metadata.json +++ b/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected-metadata.json @@ -4,7 +4,7 @@ "dir": null, "excerpt": "The Sept. 27, 2022 episode of “The Ezra Klein Show”", "siteName": "fakehost", - "siteIcon": "/vi-assets/static-assets/favicon-d2483f10ef688e6f89e23806b9700298.ico", + "siteIcon": "http://fakehost/vi-assets/static-assets/favicon-d2483f10ef688e6f89e23806b9700298.ico", "previewImage": "https://static01.nyt.com/newsgraphics/images/icons/defaultPromoCrop.png", "publishedDate": "2022-09-27T16:25:17.221Z", "language": "English", diff --git a/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected.html b/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected.html index 8d79daf8c2..725bef0943 100644 --- a/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected.html +++ b/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected.html @@ -5,8 +5,6 @@

    The Ezra Klein Show

    -

    -

    diff --git a/packages/readabilityjs/test/test-pages/nytimes.com/expected.html b/packages/readabilityjs/test/test-pages/nytimes.com/expected.html index ceb0aa925a..9777d24a69 100644 --- a/packages/readabilityjs/test/test-pages/nytimes.com/expected.html +++ b/packages/readabilityjs/test/test-pages/nytimes.com/expected.html @@ -30,8 +30,6 @@
    -

    -