Skip to content

Commit

Permalink
Merge pull request #2813 from omnivore-app/fix/published-date
Browse files Browse the repository at this point in the history
get published date from url and time elements
  • Loading branch information
sywhb committed Sep 28, 2023
2 parents c9b178c + 2b23b0e commit cd8f47a
Show file tree
Hide file tree
Showing 10 changed files with 2,393 additions and 7 deletions.
36 changes: 34 additions & 2 deletions packages/readabilityjs/Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,22 @@ const extractPublishedDateFromAuthor = (author)=> {
return [authorName, null];
};

/**
 * Extracts a published date embedded in a URL, e.g.
 * `https://example.com/2023/09/26/post` or `https://example.com/2023-09-26/post`.
 *
 * A candidate is a 4-digit year, a separator (`/` or `-`), a 2-digit month,
 * the same separator again, and a 2-digit day. The candidate must not be part
 * of a longer run of digits, and it must be a real calendar date. Candidates
 * are tried left to right; the first valid one wins.
 *
 * @param {string} url The URL (or any string) to scan.
 * @returns {Date|null} The date at local midnight, or null when the input is
 * empty or contains no valid date.
 */
const extractPublishedDateFromUrl = (url) => {
  if (!url) return null;

  const text = String(url);
  // `(?:^|\D)` and `(?!\d)` keep the match from starting or ending inside a
  // longer number (e.g. an article id like 12345-67-89). `\2` forces both
  // separators to be identical. The literal is created on every call, so the
  // stateful `lastIndex` of the `g` flag never leaks between calls.
  const regex = /(?:^|\D)(\d{4})([\/-])(\d{2})\2(\d{2})(?!\d)/g;

  let match;
  while ((match = regex.exec(text)) !== null) {
    const year = parseInt(match[1], 10);
    const month = parseInt(match[3], 10) - 1; // January is 0 in JavaScript Date
    const day = parseInt(match[4], 10);

    const date = new Date(year, month, day);
    // Date silently rolls over out-of-range values (month 13, Feb 30, ...) and
    // maps years 0-99 to 1900-1999; accept the candidate only if it round-trips.
    if (
      date.getFullYear() === year &&
      date.getMonth() === month &&
      date.getDate() === day
    ) {
      return date;
    }
  }
  return null;
};

/**
* Public constructor.
* @param {Document} doc The document to parse.
Expand Down Expand Up @@ -1081,6 +1097,18 @@ Readability.prototype = {
}
// we don't want to check for dates in URLs
if (node.tagName.toLowerCase() === 'a') return
// get the datetime from time element
if (node.tagName.toLowerCase() === 'time') {
const datetime = node.getAttribute('datetime')
if (datetime) {
const date = new Date(datetime)
if (!isNaN(date)) {
this._articlePublishedDate = date
return true
}
}
}

// Searching for the real date in the text content
const content = node.textContent.trim()
let dateFound
Expand Down Expand Up @@ -3056,7 +3084,11 @@ Readability.prototype = {
return null;

const byline = metadata.byline || this._articleByline;
const [author, publishedAt] = extractPublishedDateFromAuthor(byline);
const [author, publishedDateFromAuthor] = extractPublishedDateFromAuthor(byline);
const publishedDate = metadata.publishedDate ||
extractPublishedDateFromUrl(this._documentURI) ||
publishedDateFromAuthor ||
this._articlePublishedDate;

this._postProcessContent(articleContent);

Expand Down Expand Up @@ -3092,7 +3124,7 @@ Readability.prototype = {
siteName: metadata.siteName,
siteIcon: metadata.siteIcon,
previewImage: metadata.previewImage,
publishedDate: metadata.publishedDate || publishedAt || this._articlePublishedDate,
publishedDate,
language: this._getLanguage(metadata.locale || this._languageCode),
};
}
Expand Down
6 changes: 6 additions & 0 deletions packages/readabilityjs/test/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@
<a href="./test-pages/electrek/distiller.html" target="iframe_b">[dom-distiller]</a>
</li>

<li>caixin<br />
<a href="./test-pages/caixin/source.html" target="iframe_b">[source]</a>
<a href="./test-pages/caixin/expected.html" target="iframe_b">[readability]</a>
<a href="./test-pages/caixin/distiller.html" target="iframe_b">[dom-distiller]</a>
</li>

<li>news.utexas<br />
<a href="./test-pages/news.utexas/source.html" target="iframe_b">[source]</a>
<a href="./test-pages/news.utexas/expected.html" target="iframe_b">[readability]</a>
Expand Down
20 changes: 20 additions & 0 deletions packages/readabilityjs/test/test-pages/caixin/distiller.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<div><p>
  <b>【财新网】</b>9月26日,汽车服务平台<a href="https://s.ccxe.com.cn/entities/companies/202035144" target="_blank">途虎养车</a>正式在港交所主板挂牌上市。途虎养车( <a href="09690.HKM">09690.HK</a> )上市发行价为28港元/股,此前公司披露的发行价区间为28港元/股至31港元/股,即实际发行价为区间下限。当日,途虎养车收报29.5港元/股,较发行价涨5.36%,市值为239.6亿港元。
</p><p>
  途虎养车上市不易。途虎养车2022年1月即在港交所递表,2022年8月、2023年3月两次重新递交上市申请材料,终于在2023年8月23日通过聆讯。
</p><img src="https://file.caixin.com/images/m/lineContent.png"/><div>


</div><div>
后获取已订阅的阅读权限
</div><div>
财新通会员<br/>
可畅读全文
</div><div>
</div><img src="https://showimg.caixin.com/dolphinfile/caixin/2023/06/15688_1_16865563181267.jpg"/><img src="https://file.caixin.com/static/mh5/images/conponclose.png"/><p>
  推荐进入<a href="https://cxdata.caixin.com/index" target="_blank">财新数据库</a>,可随时查阅公司股价走势、结构人员变化等投资信息。
</p><div>
责任编辑:屈运栩 | 版面编辑:刘潇(ZN028)
</div><dt>
话题:
</dt></div>
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"title": "途虎养车港交所挂牌 腾讯为最大外部股东",
"byline": "文|财新 余聪",
"dir": null,
"excerpt": "途虎养车 腾讯国内汽车服务市场高度分散,2022年,途虎养车取得汽车服务收入115亿元,市场份额0.9%",
"siteName": "fakehost",
"previewImage": "https://img.caixin.com/2023-09-26/169572084568190_560_373.jpg",
"publishedDate": "2023-09-25T16:00:00.000Z",
"language": "English",
"readerable": true
}
45 changes: 45 additions & 0 deletions packages/readabilityjs/test/test-pages/caixin/expected.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
<DIV class="page" id="readability-page-1">
<div id="the_content">
<div id="conTit">
<h2> 途虎养车港交所挂牌 腾讯为最大外部股东 <img src="https://file.caixin.com/webchannel/all/img/icon_key.png">
</h2>
<!--baidu begin-->
<!--baidu end-->
<div id="artInfo">
<!-- tt.s 12/17 -->
<p> 文|财新 余聪 </p><!-- tt.e 12/17 -->
<p> 2023年09月26日 17:22 </p>
<!-- tt.s 12/17 来源于
<a href=http://www.caixin.com target=_blank>财新网</a>
tt.e 12/17-->
<!-- 新版音频播放器代码 begin -->
<!-- 新版音频播放器代码 end -->
<img id="swit" height="26" src="http://file.caixin.com/images/content/PC.jpg"> 试听
</div>
<p> 国内汽车服务市场高度分散,2022年,途虎养车取得汽车服务收入115亿元,市场份额0.9% </p>
</div>
<div id="Main_Content_Val">
<p>   <b>【财新网】</b>9月26日,汽车服务平台<a href="https://s.ccxe.com.cn/entities/companies/202035144" target="_blank">途虎养车</a>正式在港交所主板挂牌上市。途虎养车( <a onclick="return false" href="http://fakehost/test/09690.HKM">09690.HK</a> )上市发行价为28港元/股,此前公司披露的发行价区间为28港元/股至31港元/股,即实际发行价为区间下限。当日,途虎养车收报29.5港元/股,较发行价涨5.36%,市值为239.6亿港元。 </p>
<p>   途虎养车上市不易。途虎养车2022年1月即在港交所递表,2022年8月、2023年3月两次重新递交上市申请材料,终于在2023年8月23日通过聆讯。 </p>
</div>
<!--杂志购买 begin-->
<!--全站公用文章页收费框碎片-->
<div id="chargeWall">
<p><img src="https://file.caixin.com/images/m/lineContent.png"></p>
<p>登录 后获取已订阅的阅读权限 </p>
<!---->
<!---->
<!---->
<!---->
<!---->
<!---->
</div>
<div id="pay-box">
<p><img src="https://file.caixin.com/static/mh5/images/conponclose.png"></p>
</div>
<!--<script src="//file.caixin.com/pkg/cx-pay-layer/js/chunk-vendors.js"></script>-->
<!--杂志购买 end-->
<p>   推荐进入<a href="https://cxdata.caixin.com/index" target="_blank">财新数据库</a>,可随时查阅公司股价走势、结构人员变化等投资信息。 </p>
<p> 责任编辑:屈运栩 | 版面编辑:刘潇(ZN028) </p>
</div>
</DIV>
Loading

1 comment on commit cd8f47a

@vercel
Copy link

@vercel vercel bot commented on cd8f47a Sep 28, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.