Showing 6 changed files with 243 additions and 1 deletion.
110 changes: 110 additions & 0 deletions
src/main/scala/com/johnsnowlabs/reader/TextReader.scala
@@ -0,0 +1,110 @@
/*
 * Copyright 2017-2025 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.johnsnowlabs.reader

import com.johnsnowlabs.nlp.util.io.ResourceHelper
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.udf

import scala.collection.mutable

class TextReader(titleLengthSize: Int = 50) extends Serializable {

  private val spark = ResourceHelper.spark
  import spark.implicits._

  /** Parses TXT files and returns a DataFrame.
    *
    * The DataFrame will contain:
    *   - "path": the file path,
    *   - "content": the raw text content,
    *   - "txt": a Seq[HTMLElement] containing the parsed elements.
    */
  def txt(filePath: String): DataFrame = {
    if (ResourceHelper.validFile(filePath)) {
      val textFilesRDD = spark.sparkContext.wholeTextFiles(filePath)
      textFilesRDD
        .toDF("path", "content")
        .withColumn("txt", parseTxtUDF($"content"))
    } else {
      throw new IllegalArgumentException(s"Invalid filePath: $filePath")
    }
  }

  private val parseTxtUDF = udf((text: String) => parseTxt(text))
  /** Parses the given text into a sequence of HTMLElements.
    *
    * Parsing logic:
    *   - Split the text into blocks using a delimiter of two or more consecutive newlines.
    *   - Using heuristics, consider a block a title candidate if it is short and either all
    *     uppercase or in title case.
    *   - If a block is a title candidate and the following block exists and is not a title
    *     candidate, treat the first as the Title and the second as its NarrativeText.
    *   - Otherwise, treat blocks as narrative text.
    *   - Omit any element with empty content.
    */
  private def parseTxt(text: String): Seq[HTMLElement] = {
    val blocks = text.split("\\n\\n+").map(_.trim).filter(_.nonEmpty)
    val elements = mutable.ArrayBuffer[HTMLElement]()
    var i = 0
    while (i < blocks.length) {
      val currentBlock = blocks(i)
      if (isTitleCandidate(currentBlock)) {
        elements += HTMLElement(
          "Title",
          currentBlock,
          mutable.Map("paragraph" -> (i / 2).toString))
        if (i + 1 < blocks.length && !isTitleCandidate(blocks(i + 1))) {
          val narrative = blocks(i + 1)
          if (narrative.nonEmpty) {
            elements += HTMLElement(
              "NarrativeText",
              narrative,
              mutable.Map("paragraph" -> (i / 2).toString))
          }
          i += 2
        } else {
          i += 1
        }
      } else {
        elements += HTMLElement(
          "NarrativeText",
          currentBlock,
          mutable.Map("paragraph" -> (i / 2).toString))
        i += 1
      }
    }
    elements
  }
  /** Heuristic function to determine if a given line/block is a title candidate.
    *
    * Currently, we consider a block a title candidate if:
    *   - It is non-empty and contains at least one letter.
    *   - It is either all uppercase (ignoring non-letter characters) or in title case.
    *   - It is relatively short (titleLengthSize characters or fewer, 50 by default).
    */
  private def isTitleCandidate(text: String): Boolean = {
    val trimmed = text.trim
    if (trimmed.isEmpty) return false
    val isAllUpper = trimmed.forall(c => !c.isLetter || c.isUpper)
    val isTitleCase = trimmed.split("\\s+").forall(word => word.headOption.exists(_.isUpper))
    val isShort = trimmed.length <= titleLengthSize
    val hasLetters = trimmed.exists(_.isLetter)
    (isAllUpper || isTitleCase) && isShort && hasLetters
  }

}
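
For orientation, here is a minimal usage sketch of the reader defined above. It assumes ResourceHelper has already started a Spark session (which the class itself relies on); the object name TextReaderExample is hypothetical, and the input path is the test resource directory used later in this commit.

import com.johnsnowlabs.reader.TextReader

// Hypothetical driver object, not part of the commit.
object TextReaderExample extends App {
  val reader = new TextReader()

  // txt() accepts a file or directory path readable by wholeTextFiles and returns
  // a DataFrame with "path", "content", and "txt" (the parsed Seq[HTMLElement]).
  val df = reader.txt("src/test/resources/reader/txt/")

  df.select("path", "txt").show(truncate = false)
}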
@@ -0,0 +1,9 @@
BIG DATA ANALYTICS

Apache Spark is a fast and general-purpose cluster computing system.
It provides high-level APIs in Java, Scala, Python, and R.

MACHINE LEARNING

Spark's MLlib provides scalable machine learning algorithms.
It includes tools for classification, regression, clustering, and more.
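
As a rough illustration, these are the elements parseTxt above should produce for this sample file, inferred by tracing the heuristic in the diff rather than captured from an actual run.

import com.johnsnowlabs.reader.HTMLElement
import scala.collection.mutable

// Two title/narrative pairs; each pair shares the same "paragraph" index.
val expectedElements = Seq(
  HTMLElement("Title", "BIG DATA ANALYTICS", mutable.Map("paragraph" -> "0")),
  HTMLElement(
    "NarrativeText",
    "Apache Spark is a fast and general-purpose cluster computing system.\n" +
      "It provides high-level APIs in Java, Scala, Python, and R.",
    mutable.Map("paragraph" -> "0")),
  HTMLElement("Title", "MACHINE LEARNING", mutable.Map("paragraph" -> "1")),
  HTMLElement(
    "NarrativeText",
    "Spark's MLlib provides scalable machine learning algorithms.\n" +
      "It includes tools for classification, regression, clustering, and more.",
    mutable.Map("paragraph" -> "1")))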
34 changes: 34 additions & 0 deletions
src/test/scala/com/johnsnowlabs/reader/TextReaderTest.scala
@@ -0,0 +1,34 @@
/*
 * Copyright 2017-2025 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.johnsnowlabs.reader

import com.johnsnowlabs.tags.FastTest
import org.apache.spark.sql.functions.col
import org.scalatest.flatspec.AnyFlatSpec

class TextReaderTest extends AnyFlatSpec {

  val txtDirectory = "src/test/resources/reader/txt/"
"Text Reader" should "read a directory of text files" taggedAs FastTest in { | ||
val textReader = new TextReader() | ||
val textDf = textReader.txt(s"$txtDirectory/simple-text.txt") | ||
textDf.select("txt").show(false) | ||
|
||
assert(!textDf.select(col("txt").getItem(0)).isEmpty) | ||
} | ||
|
||
} |
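
Since txt() delegates to wholeTextFiles, which also accepts directory paths, the whole resource folder can be parsed in one call. Below is a sketch of a possible follow-up test along those lines; it is not part of this commit, and the class name TextReaderDirectoryTest is hypothetical.

import com.johnsnowlabs.reader.TextReader
import com.johnsnowlabs.tags.FastTest
import org.scalatest.flatspec.AnyFlatSpec

class TextReaderDirectoryTest extends AnyFlatSpec {

  val txtDirectory = "src/test/resources/reader/txt/"

  // Pointing txt() at the directory parses every text file under it into one DataFrame.
  "Text Reader" should "read a directory of text files" taggedAs FastTest in {
    val textReader = new TextReader()
    val textDf = textReader.txt(txtDirectory)

    assert(textDf.count() > 0)
  }

}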